1- Characterize health Ethiopian oral microbiota
2- Compare
healthy Ethiopian oral microbiota to other geographic populations
3-
Conduct cross sectional analysis of case and controls in Ethiopian
cohort
4- Compare ability to predict cancer status in external
cohorts
Ethiopian data is V4 16S rRNA sequencing performed on NextSeq 2000 P1 600 cycle (not XLEAP). See https://github.com/BisanzLab/OHMC_Colaboratory for wet lab methods. DNA was extracted using the Zymobiomics Mag96 extractions and libraries prepaired using the OHMC amplicon library method.
For QIIME2 processing script see
https://github.com/BisanzLab/OHMC_Colaboratory/blob/main/analysis_scripts/AmpliconSeq_q2.Rmd.
library(tidyverse)
library(readxl)
library(qiime2R)
library(vegan)
library(ape)
library(ggtree)
library(ALDEx2)
library(Biostrings)
library(cluster)
library(dada2)
theme_set(theme_q2r()) # this will ensure that figures are stored with appropriate fonts/aesthetics
sessionInfo()## R version 4.5.0 (2025-04-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 20.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0 LAPACK version 3.9.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/New_York
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] dada2_1.36.0 Rcpp_1.1.0 cluster_2.1.8.1
## [4] Biostrings_2.76.0 GenomeInfoDb_1.44.3 XVector_0.48.0
## [7] IRanges_2.42.0 S4Vectors_0.46.0 BiocGenerics_0.54.1
## [10] generics_0.1.4 ALDEx2_1.40.0 latticeExtra_0.6-31
## [13] lattice_0.22-7 zCompositions_1.5.0-5 survival_3.8-3
## [16] truncnorm_1.0-9 MASS_7.3-65 ggtree_3.99.1
## [19] ape_5.8-1 vegan_2.7-2 permute_0.9-8
## [22] qiime2R_0.99.6 readxl_1.4.5 lubridate_1.9.4
## [25] forcats_1.0.1 stringr_1.5.2 dplyr_1.1.4
## [28] purrr_1.1.0 readr_2.1.5 tidyr_1.3.1
## [31] tibble_3.3.0 ggplot2_4.0.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.17.1
## [3] jsonlite_2.0.0 magrittr_2.0.4
## [5] farver_2.1.2 rmarkdown_2.30
## [7] fs_1.6.6 vctrs_0.6.5
## [9] multtest_2.64.0 Rsamtools_2.24.1
## [11] base64enc_0.1-3 htmltools_0.5.8.1
## [13] S4Arrays_1.8.1 cellranger_1.1.0
## [15] Rhdf5lib_1.30.0 SparseArray_1.8.1
## [17] Formula_1.2-5 rhdf5_2.52.1
## [19] gridGraphics_0.5-1 sass_0.4.10
## [21] bslib_0.9.0 htmlwidgets_1.6.4
## [23] plyr_1.8.9 cachem_1.1.0
## [25] GenomicAlignments_1.44.0 igraph_2.2.0
## [27] lifecycle_1.0.4 iterators_1.0.14
## [29] pkgconfig_2.0.3 Matrix_1.7-4
## [31] R6_2.6.1 fastmap_1.2.0
## [33] GenomeInfoDbData_1.2.14 MatrixGenerics_1.20.0
## [35] digest_0.6.37 aplot_0.2.9
## [37] colorspace_2.1-2 ShortRead_1.66.0
## [39] patchwork_1.3.2 Hmisc_5.2-4
## [41] GenomicRanges_1.60.0 hwriter_1.3.2.1
## [43] timechange_0.3.0 httr_1.4.7
## [45] abind_1.4-8 mgcv_1.9-3
## [47] compiler_4.5.0 fontquiver_0.2.1
## [49] withr_3.0.2 htmlTable_2.4.3
## [51] S7_0.2.0 backports_1.5.0
## [53] BiocParallel_1.42.2 DelayedArray_0.34.1
## [55] rappdirs_0.3.3 biomformat_1.36.0
## [57] tools_4.5.0 foreign_0.8-90
## [59] nnet_7.3-20 quadprog_1.5-8
## [61] glue_1.8.0 nlme_3.1-168
## [63] rhdf5filters_1.20.0 grid_4.5.0
## [65] checkmate_2.3.3 reshape2_1.4.4
## [67] ade4_1.7-23 gtable_0.3.6
## [69] tzdb_0.5.0 data.table_1.17.8
## [71] hms_1.1.4 foreach_1.5.2
## [73] pillar_1.11.1 yulab.utils_0.2.1
## [75] splines_4.5.0 treeio_1.32.0
## [77] deldir_2.0-4 directlabels_2025.6.24
## [79] tidyselect_1.2.1 fontLiberation_0.1.0
## [81] knitr_1.50 fontBitstreamVera_0.1.1
## [83] gridExtra_2.3 phyloseq_1.52.0
## [85] SummarizedExperiment_1.38.1 xfun_0.53
## [87] Biobase_2.68.0 matrixStats_1.5.0
## [89] DT_0.34.0 stringi_1.8.7
## [91] UCSC.utils_1.4.0 lazyeval_0.2.2
## [93] ggfun_0.2.0 yaml_2.3.10
## [95] evaluate_1.0.5 codetools_0.2-20
## [97] interp_1.1-6 gdtools_0.4.4
## [99] ggplotify_0.1.3 cli_3.6.5
## [101] RcppParallel_5.1.11-1 rpart_4.1.24
## [103] systemfonts_1.3.1 jquerylib_0.1.4
## [105] zigg_0.0.2 png_0.1-8
## [107] Rfast_2.1.5.2 parallel_4.5.0
## [109] jpeg_0.1-11 bitops_1.0-9
## [111] pwalign_1.4.0 tidytree_0.4.6
## [113] ggiraph_0.9.2 scales_1.4.0
## [115] crayon_1.5.3 rlang_1.1.6
Data was derived using v2.1 of script found here: https://github.com/BisanzLab/OHMC_Colaboratory/blob/main/analysis_scripts/AmpliconSeq_q2.Rmd.
First check negative and positive control read counts.
## EOSC_ExtCon_Plate1B2 EOSC_ExtCon_Plate1G10 EOSC_ExtCon_Plate2D3
## 601 629 856
## EOSC_ExtCon_Plate2G10 EOSC_ExtCon_Plate3C2 EOSC_ExtCon_Plate3D7
## 191 1137 641
## EOSC_ExtCon_Plate4B2 EOSC_ExtCon_Plate4E2 EOSC_ExtCon_Plate4F1
## 578 1592 560
## EOSC_ExtCon_Plate4F2 EOSC_ZymoCom_P2B10 EOSC_ZymoCom_P3C9
## 922 172785 34691
## EOSC_ZymoCom_P4C1 EOSC_ZymoCom_Plate1D7
## 126466 202605
There are <1592 reads in negative controls vs >34,691 in positive zymo controls. Will check what is found in negative controls below.
neg_contaminants<- asv_table[,subset(metadata, SampleType=="Negative Control")$SampleID] %>%
filter_features(1,1) %>% rownames()
asv_table[,subset(metadata, SampleType=="Negative Control")$SampleID] %>%
filter_features(1,1) %>%
as.data.frame() %>%
rownames_to_column("ASV") %>%
left_join(asv_taxonomy %>% dplyr::select(-Sequence)) %>%
interactive_table()All reads in extraction controls are tell-tale kit-ome of our current lot of reagents. To be safe, will use a 10,000 read threshold for successful sequencing.
Will also look for consistent contaminants.
asv_table[,subset(metadata, SampleType=="Negative Control")$SampleID] %>%
filter_features(1,1) %>%
as.data.frame() %>%
rownames_to_column("ASV") %>%
mutate_if(is.numeric, function(x) if_else(x>0,1,0)) %>%
UpSetR::upset(nsets = 10)The two conserved contaminants appear to be the same E. coli and Bordetella previously observed in this reagent batch.
Next will check composition of positive controls which are zymo community standard.
pos_contaminants<-
asv_table[,subset(metadata, SampleType=="Positive Control")$SampleID] %>%
filter_features(1,1) %>%
as.data.frame() %>%
rownames_to_column("ASV") %>%
left_join(asv_taxonomy %>% dplyr::select(-Sequence)) %>%
filter(!Genus %in% c("Bacillus","Enterococcus","Escherichia-Shigella","Limosilactobacillus","Pseudomonas","Salmonella","Staphylococcus","Listeria"))
colSums(pos_contaminants[,2:5])## EOSC_ZymoCom_P2B10 EOSC_ZymoCom_P3C9 EOSC_ZymoCom_P4C1
## 15 0 11
## EOSC_ZymoCom_Plate1D7
## 43
## [1] 17.25
## [1] 18.30073
i.e. in positive controls these putative contaminants make up less than 43 reads per sample.
asv_table[,subset(metadata, SampleType=="Positive Control")$SampleID] %>%
filter_features(1,1) %>%
as.data.frame() %>%
rownames_to_column("ASV") %>%
left_join(asv_taxonomy %>% dplyr::select(-Sequence)) %>%
interactive_table()Single/double digit contaminants that are mix of background, and either seq errors or trace cross-contamination in extractions.
pos_con<-
asv_table[,subset(metadata, SampleType=="Positive Control")$SampleID] %>%
filter_features(1,1)
summarize_taxa(pos_con, asv_taxonomy %>% column_to_rownames("ASV"))$Species %>%
taxa_barplot()Remove controls and filter for low read depth samples.
asv_table_unfiltered<-asv_table # save for later
metadata_unfiltered<-metadata
asv_table<-asv_table[,colSums(asv_table)>10000]
asv_table<-asv_table[,subset(metadata, SampleID %in% colnames(asv_table) & SampleType=="Saliva")$SampleID] %>% filter_features(1,1)
metadata<-metadata %>% filter(SampleID %in% colnames(asv_table)) %>% mutate(Group=factor(Group, levels=c("Healthy","Diseased")))
asv_table<-asv_table[,metadata$SampleID]
asv_taxonomy<-asv_taxonomy %>% filter(ASV %in% rownames(asv_table))
asv_tree<-keep.tip(asv_tree, rownames(asv_table))Double check tree for odd topology / problematic features. Will root
on Methanobrevibacteria for figures
c6cb8ffae3081efef011a521c8039722.
root(asv_tree, outgroup="c6cb8ffae3081efef011a521c8039722") %>%
ggtree() %<+% asv_taxonomy +
geom_tippoint(aes(color=Phylum))Summary of sample breakdown between healthy and controls in final dataset:
## # A tibble: 2 × 2
## Group n
## <fct> <int>
## 1 Healthy 108
## 2 Diseased 103
contaminant_reads<-
asv_table_unfiltered[unique(c(pos_contaminants$ASV, neg_contaminants)),] %>%
as.data.frame() %>%
rownames_to_column("ASV") %>%
pivot_longer(!ASV, names_to = "SampleID", values_to = "Abundance") %>%
left_join(metadata_unfiltered) %>%
filter(SampleType!="Exclude") %>%
filter(SampleType %in% c("Negative Control","Positive Control") | SampleID %in% metadata$SampleID) %>% #resync tables
mutate(SampleType=case_when(Group=="Diseased"~"Case", Group=="Healthy"~"Control", TRUE~SampleType)) %>%
group_by(ASV, SampleType, Group) %>%
summarize_at("Abundance", lst(mean,median,min,max,sd, length)) %>%
mutate(stat=paste0(median," [",min,"-",max,"]")) %>%
arrange(desc(median)) %>%
dplyr::select(ASV, SampleType, stat) %>%
pivot_wider(names_from = "SampleType", values_from = "stat")
contaminant_percent<-asv_table_unfiltered %>% make_percent()
contaminant_percent<-
contaminant_percent[unique(c(pos_contaminants$ASV, neg_contaminants)),] %>%
as.data.frame() %>%
rownames_to_column("ASV") %>%
pivot_longer(!ASV, names_to = "SampleID", values_to = "Abundance") %>%
left_join(metadata_unfiltered) %>%
filter(SampleType!="Exclude") %>%
filter(SampleType %in% c("Negative Control","Positive Control") | SampleID %in% metadata$SampleID) %>% #resync tables
mutate(SampleType=case_when(Group=="Diseased"~"Case", Group=="Healthy"~"Control", TRUE~SampleType)) %>%
group_by(ASV, SampleType, Group) %>%
summarize_at("Abundance", lst(mean,median,min,max,sd, length)) %>%
mutate_if(is.numeric, function(x) signif(x, digits=3)) %>%
mutate(stat=paste0(median," [",min,"-",max,"]")) %>%
arrange(desc(median)) %>%
dplyr::select(ASV, SampleType, stat) %>%
pivot_wider(names_from = "SampleType", values_from = "stat")
contaminant_reads %>%
full_join(contaminant_percent, by="ASV", suffix=c("_reads","_percent")) %>%
left_join(asv_taxonomy %>% dplyr::select(ASV, Kingdom, Family, Genus, Species)) %>%
interactive_table()## # A tibble: 2 × 6
## Group mean sd median min max
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Healthy 39.0 7.10 38 28 62
## 2 Diseased 51.5 13.8 50 18 105
##
## Wilcoxon rank sum test with continuity correction
##
## data: Age by Group
## W = 2182, p-value = 2.276e-14
## alternative hypothesis: true location shift is not equal to 0
## # A tibble: 4 × 3
## # Groups: Sex, Group [4]
## Sex Group n
## <chr> <fct> <int>
## 1 F Healthy 80
## 2 F Diseased 61
## 3 M Healthy 28
## 4 M Diseased 42
metadata %>% group_by(Sex, Group) %>% count() %>% pivot_wider(names_from = Group, values_from = n) %>% as.data.frame() %>% column_to_rownames("Sex") %>% fisher.test()##
## Fisher's Exact Test for Count Data
##
## data: .
## p-value = 0.02809
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.056013 3.682912
## sample estimates:
## odds ratio
## 1.960841
## # A tibble: 3 × 3
## # Groups: Smoking, Group [3]
## Smoking Group n
## <chr> <fct> <int>
## 1 No Healthy 108
## 2 No Diseased 100
## 3 Yes Diseased 3
metadata %>% group_by(Smoking, Group) %>% count() %>% pivot_wider(names_from = Group, values_from = n, values_fill = 0) %>% as.data.frame() %>% column_to_rownames("Smoking") %>% fisher.test()##
## Fisher's Exact Test for Count Data
##
## data: .
## p-value = 0.1146
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.4359386 Inf
## sample estimates:
## odds ratio
## Inf
## # A tibble: 3 × 3
## # Groups: Chewing tobacco, Group [3]
## `Chewing tobacco` Group n
## <chr> <fct> <int>
## 1 No Healthy 108
## 2 No Diseased 100
## 3 Yes Diseased 3
metadata %>% group_by(`Chewing tobacco`, Group) %>% count() %>% pivot_wider(names_from = Group, values_from = n, values_fill = 0) %>% as.data.frame() %>% column_to_rownames("Chewing tobacco") %>% fisher.test()##
## Fisher's Exact Test for Count Data
##
## data: .
## p-value = 0.1146
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.4359386 Inf
## sample estimates:
## odds ratio
## Inf
## # A tibble: 4 × 3
## # Groups: Alcohol Drink, Group [4]
## `Alcohol Drink` Group n
## <chr> <fct> <int>
## 1 No Healthy 88
## 2 No Diseased 97
## 3 Yes Healthy 20
## 4 Yes Diseased 6
metadata %>% group_by(`Alcohol Drink`, Group) %>% count() %>% pivot_wider(names_from = Group, values_from = n, values_fill = 0) %>% as.data.frame() %>% column_to_rownames("Alcohol Drink") %>% fisher.test()##
## Fisher's Exact Test for Count Data
##
## data: .
## p-value = 0.005992
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.08597284 0.74784904
## sample estimates:
## odds ratio
## 0.2737422
## # A tibble: 4 × 3
## # Groups: Khat use, Group [4]
## `Khat use` Group n
## <chr> <fct> <int>
## 1 No Healthy 104
## 2 No Diseased 98
## 3 Yes Healthy 4
## 4 Yes Diseased 5
metadata %>% group_by(`Khat use`, Group) %>% count() %>% pivot_wider(names_from = Group, values_from = n, values_fill = 0) %>% as.data.frame() %>% column_to_rownames("Khat use") %>% fisher.test()##
## Fisher's Exact Test for Count Data
##
## data: .
## p-value = 0.7435
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.2763203 6.8773063
## sample estimates:
## odds ratio
## 1.324753
## # A tibble: 3 × 3
## # Groups: Coffee drinking, Group [3]
## `Coffee drinking` Group n
## <chr> <fct> <int>
## 1 No Healthy 9
## 2 Yes Healthy 99
## 3 Yes Diseased 103
metadata %>% group_by(`Coffee drinking`, Group) %>% count() %>% pivot_wider(names_from = Group, values_from = n, values_fill = 0) %>% as.data.frame() %>% column_to_rownames("Coffee drinking") %>% fisher.test()##
## Fisher's Exact Test for Count Data
##
## data: .
## p-value = 0.003342
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.980536 Inf
## sample estimates:
## odds ratio
## Inf
dists<- asv_table %>% filter_features(1,1) %>% make_clr() %>% t() %>% dist(., method="euclidean")
mat <- as.matrix(dists)
rownames(mat) <- colnames(mat)
# Extract unique pairs (no diagonal, no duplicates)
idx <- which(upper.tri(mat), arr.ind = TRUE)
df <- data.frame(
sample1 = rownames(mat)[idx[,1]],
sample2 = colnames(mat)[idx[,2]],
distance = mat[idx]
)
df<-
df %>%
as_tibble() %>%
arrange(sample1) %>%
left_join(metadata %>% dplyr::select(sample1=SampleID, Pair1=Pair)) %>%
left_join(metadata %>% dplyr::select(sample2=SampleID, Pair2=Pair)) %>%
mutate(PairType=if_else(Pair1==Pair2 & Pair1!="Unpaired", "Intra-Pair", "Inter-Pair"))
#filter(PairType=="Intra-Pair") %>%
df %>%
ggplot(aes(x=PairType, y=distance)) +
geom_boxplot()##
## Welch Two Sample t-test
##
## data: distance by PairType
## t = -0.85565, df = 64.386, p-value = 0.3954
## alternative hypothesis: true difference in means between group Inter-Pair and group Intra-Pair is not equal to 0
## 95 percent confidence interval:
## -6.754442 2.703169
## sample estimates:
## mean in group Inter-Pair mean in group Intra-Pair
## 126.0124 128.0381
Samples belonging to cases and the individual who accompanied them to the clinic are not significantly more similar than un-associated individuals.
To examine batch effects we will only examine extraction plate as the remainder of the processing was performed on a single 384 well plate/sequencing run.
metadata %>% group_by(Group, ExtractionPlate) %>% count() %>% pivot_wider(names_from = "Group", values_from = "n") # no discrepancies by extraction plate.## # A tibble: 4 × 3
## # Groups: ExtractionPlate [4]
## ExtractionPlate Healthy Diseased
## <chr> <int> <int>
## 1 Plate1 38 38
## 2 Plate2 31 26
## 3 Plate3 36 37
## 4 Plate4 3 2
There is no evidence that controls vs cancer participants were unevenly distributed across plates.
batchalpha<-
asv_table %>% subsample_table() %>% t() %>% picante::pd(., asv_tree) %>%
rownames_to_column("SampleID") %>%
dplyr::select(SampleID, PhylogeneticDiversity="PD", ASV_Richness="SR") %>%
full_join(
asv_table %>% subsample_table() %>% t() %>% diversity(., "shannon") %>% data.frame(ShannonDiversity=.) %>% rownames_to_column("SampleID")
) %>%
inner_join(metadata) %>%
pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity")
batchalpha %>% group_by(Metric) %>% do(aov(Diversity~ExtractionPlate, data=.) %>% broom::tidy()) %>% knitr::kable()| Metric | term | df | sumsq | meansq | statistic | p.value |
|---|---|---|---|---|---|---|
| ASV_Richness | ExtractionPlate | 3 | 1.614791e+04 | 5.382638e+03 | 0.5204398 | 0.6686741 |
| ASV_Richness | Residuals | 207 | 2.140893e+06 | 1.034248e+04 | NA | NA |
| PhylogeneticDiversity | ExtractionPlate | 3 | 1.226250e+01 | 4.087502e+00 | 0.2525145 | 0.8594882 |
| PhylogeneticDiversity | Residuals | 207 | 3.350749e+03 | 1.618719e+01 | NA | NA |
| ShannonDiversity | ExtractionPlate | 3 | 2.089590e-02 | 6.965300e-03 | 0.0299437 | 0.9930066 |
| ShannonDiversity | Residuals | 207 | 4.815105e+01 | 2.326138e-01 | NA | NA |
batchalpha %>% ggplot(aes(x=ExtractionPlate, y=Diversity, fill=ExtractionPlate)) + geom_boxplot() + facet_wrap(~Metric, scales="free") + theme(legend.position = "none")No evidence of impact on diversity of communities. Will check community structure.
batchdist<-
summarize_taxa(asv_table, asv_taxonomy %>% column_to_rownames("ASV"))$Genus %>%
make_clr() %>%
t() %>%
dist(., method="euclidean")
batchpc<-ape::pcoa(batchdist)
batchpc$vectors %>%
as.data.frame() %>%
rownames_to_column("SampleID") %>%
left_join(metadata) %>%
ggplot(aes(x=Axis.1, y=Axis.2, fill=ExtractionPlate)) +
geom_point(shape=21, color="grey50") +
theme(legend.position="bottom") +
xlab(paste0("PC1: ",round(batchpc$values$Relative_eig[1]*100,2),"%")) +
ylab(paste0("PC2: ",round(batchpc$values$Relative_eig[2]*100,2),"%"))adonis2(batchdist~metadata[match(labels(batchdist), metadata$SampleID),]$ExtractionPlate) %>% print()## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = batchdist ~ metadata[match(labels(batchdist), metadata$SampleID), ]$ExtractionPlate)
## Df SumOfSqs R2 F Pr(>F)
## Model 3 4526 0.01591 1.1154 0.207
## Residual 207 279989 0.98409
## Total 210 284515 1.00000
fig1_meta<-metadata %>% filter(Group=="Healthy")
fig1_asvs<-asv_table[,fig1_meta$SampleID] %>% filter_features(1,1)fig1_adiversity<-
fig1_asvs %>% subsample_table() %>% t() %>% picante::pd(., asv_tree) %>%
rownames_to_column("SampleID") %>%
dplyr::select(SampleID, PhylogeneticDiversity="PD", ASV_Richness="SR") %>%
full_join(
fig1_asvs %>% subsample_table() %>% t() %>% diversity(., "shannon") %>% data.frame(ShannonDiversity=.) %>% rownames_to_column("SampleID")
) %>%
left_join(metadata)
interactive_table(fig1_adiversity)fig1_adiversity %>%
pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
group_by(Metric) %>%
summarize_at("Diversity", lst(mean,median,sd,min,max)) %>%
interactive_table()fig1_adiversity %>%
pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
ggplot(aes(x=Diversity)) +
geom_freqpoly(bins=20) +
facet_wrap(~Metric, scales="free") +
ylab("# Healhy Participants") +
xlab("Alpha Diversity")fams<-summarize_taxa(fig1_asvs, asv_taxonomy %>% column_to_rownames("ASV"))$Family %>% make_percent()
sample_order<-dist(t(fams)) %>% hclust(method="average") # average is upgma %>% plot()
sample_order<-sample_order$labels[sample_order$order]
q2r_palette <- c("blue4", "olivedrab", "firebrick", "gold",
"darkorchid", "steelblue2", "chartreuse1", "aquamarine",
"yellow3", "coral", "grey")
toplot<-rowMeans(fams) %>% sort(.,decreasing=TRUE) %>% names() %>% .[1:10]
fams %>%
as.data.frame() %>%
rownames_to_column("Taxon") %>%
pivot_longer(!Taxon) %>%
mutate(TaxonGroup=if_else(Taxon %in% toplot, Taxon, "Other")) %>%
group_by(TaxonGroup, name) %>%
summarize(value=sum(value)) %>%
ungroup() %>%
mutate(name=factor(name, levels=sample_order)) %>%
mutate(TaxonGroup=factor(TaxonGroup, levels=rev(c(toplot,"Other")))) %>%
ggplot(aes(x=name, y=value, fill=TaxonGroup)) +
geom_bar(stat="identity") +
scale_fill_manual(values=rev(q2r_palette), name="Family") +
coord_cartesian(ylim=c(0,100), expand = F) +
theme(axis.ticks.x = element_blank(), axis.text.x=element_blank()) +
xlab("Participant") +
ylab("% Abundance") +
theme(legend.text = element_text(size=3))Alternate reviewer figure showing “average” healthy composition. Will use the compositional average/center.
avgsamp<-
summarize_taxa(fig1_asvs, asv_taxonomy %>% column_to_rownames("ASV"))$Family %>%
make_clr() %>%
apply(., 1, mean) %>%
data.frame(CLR=.) %>%
rownames_to_column("Family") %>%
mutate(Percent=2^CLR/sum(2^CLR)*100) %>%
arrange(desc(Percent)) %>%
top_n(10, Percent)
avgsamp %>%
bind_rows(
tibble(Family="Other", CLR=NA, Percent=100-sum(avgsamp$Percent))
) %>%
mutate(Family=factor(Family, levels=Family) )%>%
ggplot(aes(x=1, y=Percent, fill=Family)) +
geom_col(color="black") +
coord_polar(theta="y") +
theme_void() +
scale_fill_manual(values=q2r_palette)In healthy groups check for sub structure/community state types using PAM with validation by gap statistic. Using genus abundances here to reduce dimensionality and for later compatibility.
fig1_asvs<-summarize_taxa(fig1_asvs, asv_taxonomy %>% column_to_rownames("ASV"))$Genus
dists<-list()
dists$`CLR Euclidean`<-fig1_asvs %>% make_clr() %>% t() %>% dist(., method="euclidean")
pcos<-lapply(dists, ape::pcoa)Metric="CLR Euclidean"
do_pam<-function(distmat, Nclusters){list(cluster = pam(distmat,Nclusters, cluster.only=TRUE))}
gapstats<-clusGap(pcos[[Metric]]$vectors, FUN=do_pam, K.max = 10, B=100) #maximum of 13 clusters (ie 2 per cluster)
gapstats<-
gapstats$Tab %>%
as.data.frame() %>%
mutate(Nclusters=1:nrow(.)) %>%
dplyr::select(Nclusters, GapStatistic=gap, SE=SE.sim)
ggplot(gapstats, aes(x=Nclusters, y=GapStatistic, ymin=GapStatistic-SE, ymax=GapStatistic+SE)) +
geom_errorbar(width=0) +
geom_point() +
scale_x_continuous(breaks=(1:10))lapply(names(pcos), function(x){
pcos[[x]]$values %>%
as.data.frame() %>%
mutate(Metric=x) %>%
mutate(Axis=1:nrow(.))
}) %>%
bind_rows() %>%
filter(Axis %in% 1:3) %>%
dplyr::select(Metric, Axis, everything()) %>%
interactive_table()pcos[[Metric]]$vectors %>%
as.data.frame() %>%
rownames_to_column("SampleID") %>%
left_join(metadata) %>%
left_join(clustered_metadata) %>%
ggplot(aes(x=Axis.1, y=Axis.2, fill=Cluster)) +
geom_point(shape=21, color="grey50") +
theme(legend.position="bottom") +
xlab(paste0("PC1: ",round(pcos[[Metric]]$values$Relative_eig[1]*100,2),"%")) +
ylab(paste0("PC2: ",round(pcos[[Metric]]$values$Relative_eig[2]*100,2),"%")) +
scale_fill_manual(values=c("darkorange","purple4"))ggsave("manuscript_figures/healthy_clreuc.pdf", height=2.3, width=2, useDingbats=F)
vegan::adonis2(dists[[Metric]]~Cluster, data=clustered_metadata[match(labels(dists[[Metric]]), clustered_metadata$SampleID),])## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
##
## vegan::adonis2(formula = dists[[Metric]] ~ Cluster, data = clustered_metadata[match(labels(dists[[Metric]]), clustered_metadata$SampleID), ])
## Df SumOfSqs R2 F Pr(>F)
## Model 1 24411 0.18321 23.776 0.001 ***
## Residual 106 108828 0.81679
## Total 107 133238 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## # A tibble: 2 × 2
## Cluster n
## <chr> <int>
## 1 Cluster_1 64
## 2 Cluster_2 44
fig1_adiversity %>%
left_join(clustered_metadata) %>%
ggplot(aes(x=Cluster, y=ShannonDiversity, fill=Cluster)) +
geom_boxplot() +
scale_fill_manual(values=c("darkorange","purple4")) +
theme(legend.position="none") +
coord_cartesian(ylim=c(2, 6))ggsave("manuscript_figures/healthy_clusters_diversity.pdf", height=2, width=1, useDingbats=F)
fig1_adiversity %>%
left_join(clustered_metadata) %>%
t.test(ShannonDiversity~Cluster, data=.)##
## Welch Two Sample t-test
##
## data: ShannonDiversity by Cluster
## t = 7.1238, df = 69.53, p-value = 7.657e-10
## alternative hypothesis: true difference in means between group Cluster_1 and group Cluster_2 is not equal to 0
## 95 percent confidence interval:
## 0.4302591 0.7649074
## sample estimates:
## mean in group Cluster_1 mean in group Cluster_2
## 4.357102 3.759519
results<-
fig1_asvs[,clustered_metadata$SampleID] %>%
filter_features(5,5) %>%
aldex(reads=., clustered_metadata$Cluster, include.sample.summary = TRUE)
cleaned_results<-
results %>% filter(we.eBH<0.1 & abs(diff.btw)>1) %>%
arrange(diff.btw) %>%
rownames_to_column("ASV") %>%
dplyr::select(ASV, Cluster1_CLR=rab.win.Cluster_1, Cluster2=rab.win.Cluster_2, log2FC=diff.btw, Pvalue=we.ep, FDR=we.eBH) %>%
arrange(log2FC)
interactive_table(cleaned_results)write_tsv(cleaned_results, "manuscript_figures/healthy_clusters_aldex.tsv")
results %>%
rownames_to_column("Taxon") %>%
mutate(Label=if_else(Taxon %in% cleaned_results$ASV, Taxon,"")) %>%
mutate(fc=case_when(
Label!="" & diff.btw>0 ~ "Cluster 2",
Label!="" & diff.btw<0 ~ "Cluster 1",
TRUE ~ "ns"
)) %>%
mutate(Label=gsub("..+\\;","", Label)) %>%
ggplot(aes(y=-log10(we.ep), x=diff.btw, color=fc)) +
geom_point(shape=16, alpha=0.6) +
ggrepel::geom_text_repel(aes(label=Label), size=1.8) +
xlab("log2(fold change) Cluster 2 vs Cluster 1") +
ylab("-log10(P-value)") +
scale_color_manual(values=c("darkorange","purple4", "grey50")) +
theme(legend.position="none")Derived using the V6 total dual-dye 16S qPCR and normalized based on a 1:1 dilution with preservative and 5mL of saliva.
clustered_qpcr<-
read_csv("EOSC_16S_qPCR.csv") %>%
dplyr::select(SampleID, Cq, `16S copies/mL`, `log10(16S copies/mL)`) %>%
filter(SampleID %in% clustered_metadata$SampleID) %>%
left_join(clustered_metadata)
interactive_table(clustered_qpcr)clustered_qpcr %>%
group_by(Cluster) %>%
summarize_at("log10(16S copies/mL)", lst(mean,median,sd,min,max, length)) %>% knitr::kable()| Cluster | mean | median | sd | min | max | length |
|---|---|---|---|---|---|---|
| Cluster_1 | 7.532746 | 7.700356 | 0.9862739 | 3.166367 | 8.739529 | 64 |
| Cluster_2 | 6.733028 | 6.679563 | 0.6072923 | 5.703809 | 7.959374 | 44 |
clustered_qpcr %>%
ggplot(aes(x=Cluster, y=`log10(16S copies/mL)`, fill=Cluster)) +
geom_boxplot() +
scale_fill_manual(values=c("darkorange","purple4")) +
theme(legend.position="none") +
coord_cartesian(ylim=c(3,10))ggsave("manuscript_figures/clustered_qPCR.pdf", height=2, width=1, useDingbats=F)
t.test(`log10(16S copies/mL)`~Cluster, data=clustered_qpcr) %>% broom::tidy()## # A tibble: 1 × 10
## estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.800 7.53 6.73 5.21 9.58e-7 105. 0.495 1.10
## # ℹ 2 more variables: method <chr>, alternative <chr>
tm<-asv_table %>% subsample_table()
tm %>% biomformat::make_biom() %>% biomformat::write_biom("asv_table.biom")
ta<-asv_taxonomy %>% filter(ASV %in% rownames(tm))
fa<-DNAStringSet(ta$Sequence)
names(fa)<-ta$ASV
writeXStringSet(fa, "asv_seqs.fa")
# see https://github.com/picrust/picrust2/wiki/Full-pipeline-script
conda activate picrust2 # picrust 2.4.1
picrust2_pipeline.py -s asv_seqs.fa -i asv_table.biom -o picrust2_out_strat -p 24 --stratified
add_descriptions.py -i picrust2_out_strat/pathways_out/path_abun_unstrat.tsv.gz -m METACYC -o picrust2_out_strat/pathways_w_desc.tsvpicrust<-read_tsv("picrust2_out_strat/pathways_w_desc.tsv")[,c("pathway","description", colnames(fig1_asvs))]
picrust<-picrust[rowSums(picrust[,3:ncol(picrust)])>0,]
picrust<-
picrust %>%
pivot_longer(!c(pathway, description), names_to = "SampleID", values_to = "Abundance") %>%
group_by(SampleID) %>%
mutate(Abundance=Abundance/sum(Abundance)) %>%
group_by(pathway, description) %>%
mutate(Abundance=log2(Abundance+(min_nonzero(Abundance)*(2/3)))) %>%
ungroup() %>%
left_join(clustered_metadata) %>%
mutate(Cluster=factor(Cluster, levels=c("Cluster_1","Cluster_2")))
picrust_results<-
picrust %>%
group_by(pathway, description) %>%
do(
t.test(Abundance~Cluster, data=.) %>%
broom::tidy()
) %>%
ungroup()
picrust_results<-
picrust_results %>%
dplyr::rename(log2FC=estimate, Cluster1=estimate1, Cluster2=estimate2) %>%
mutate(FDR=p.adjust(p.value, method="BH"))
interactive_table(picrust_results)picrust_results %>%
filter(FDR<0.1 & abs(log2FC)>1) %>%
arrange(desc(log2FC)) %>%
mutate(dir=if_else(Cluster1>Cluster2, "Cluster1", "Cluster2")) %>%
mutate(pathway=paste0(pathway, ": ", description)) %>%
mutate(pathway=factor(pathway, levels=rev(pathway))) %>%
ggplot(aes(x=pathway, y=log2FC, ymin=conf.low, ymax=conf.high, color=dir)) +
geom_hline(yintercept = 0, linetype="dashed", color="grey50") +
geom_errorbar(width=0) +
geom_point(size=1) +
coord_flip() +
scale_color_manual(values=c("darkorange","purple4")) +
theme(legend.position="none") +
theme(axis.text.y=element_text(size=5)) +
ylab("log2(fold change) (Cluster 1 vs Cluster 2") +
xlab("")ggsave("manuscript_figures/healthy_picrust_clusters.pdf", height=5, width=5, useDingbats=F)
rm(picrust, picrust_results)All Ethiopian data was reprocessed by Iyun for comparisons against the AGP and other cohorts which is described in Ademola-Popoola IJ, Abdul-Aziz MA, Ta CK, Barreiro LB, Perry G, Weyrich LS. Evolutionary Histories and Shared Environments Shape Oral Microbiomes in Ugandan Batwa and Bakiga Populations.
## [1] 1773 811
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 65 10096 15947 29593 25754 324344
## [1] 1773 748
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5019 11825 17100 31844 26766 324344
comp_meta<-read_tsv("inputs/weyrich_metadata.txt") %>% filter(sample_name %in% colnames(comp_table))
comp_meta %>% count(country)## # A tibble: 5 × 2
## country n
## <chr> <int>
## 1 AGP 492
## 2 Ethiopia 107
## 3 Tanzania 36
## 4 Uganda 97
## 5 Venezuela 16
comp_seqs<-read_qza("weyrichdata/Global_merged_rep_seqs.qza")$data
comp_seqs<-comp_seqs[rownames(comp_table)]
comp_taxonomy<-readRDS("inputs/weyrich_taxonomy.RDS")
comp_meta<-comp_meta %>% dplyr::select(SampleID=sample_name, Country=country)comp_sub<-subsample_table(comp_table)
comp_adiv<-comp_sub %>% t() %>% diversity(., "shannon") %>% data.frame(ShannonDiversity=.) %>% rownames_to_column("SampleID") %>% left_join(comp_meta)
interactive_table(comp_adiv)comp_adiv %>%
group_by(Country) %>%
summarize_at("ShannonDiversity", lst(mean,median,sd,min,max)) %>%
interactive_table()comp_adiv %>%
ggplot(aes(x=Country, y=ShannonDiversity, fill=Country)) +
geom_boxplot() +
ylab("Shannon Diversity") +
xlab("Country") +
theme(legend.position="none") +
coord_cartesian(ylim=c(0, 7))ggsave("manuscript_figures/comp_alpha_diversity.pdf", height=2, width=2.2, useDingbats=F)
fit<-comp_adiv %>% aov(ShannonDiversity~Country, data=.)
summary(fit)## Df Sum Sq Mean Sq F value Pr(>F)
## Country 4 357.6 89.40 266.4 <2e-16 ***
## Residuals 743 249.4 0.34
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
comp_dist<-summarize_taxa(comp_table, as.data.frame(comp_taxonomy))$Genus %>% make_clr() %>% t() %>% dist(., method="euclidean")
comp_pc<-pcoa(comp_dist)
comp_pc$vectors %>%
as.data.frame() %>%
rownames_to_column("SampleID") %>%
left_join(comp_meta) %>%
left_join(clustered_metadata %>% mutate(SampleID=gsub("_",".", SampleID))) %>%
mutate(Cluster=if_else(is.na(Cluster),"Other", Cluster)) %>%
#ggplot(aes(x=Axis.1, y=Axis.2, fill=Country)) +
ggplot(aes(x=Axis.1, y=Axis.2, fill=Country, shape=Cluster)) +
geom_point() +
#geom_point(shape=21) +
scale_shape_manual(values=c(24,25,21)) +
xlab(paste0("PC1: ",round(comp_pc$values$Relative_eig[1]*100,2),"%")) +
ylab(paste0("PC2: ",round(comp_pc$values$Relative_eig[2]*100,2),"%"))ggsave("manuscript_figures/comp_pcoa.pdf", height=2, width=3, useDingbats=F)
vegan::adonis2(comp_dist~Country, data=comp_meta[match(labels(comp_dist), comp_meta$SampleID),])## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
##
## vegan::adonis2(formula = comp_dist ~ Country, data = comp_meta[match(labels(comp_dist), comp_meta$SampleID), ])
## Df SumOfSqs R2 F Pr(>F)
## Model 4 218158 0.33151 92.113 0.001 ***
## Residual 743 439925 0.66849
## Total 747 658083 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
comp_results<-list()
for(f in unique(comp_meta$Country)){
if(f=="Ethiopia"){next}
message(f)
tm<-subset(comp_meta, Country %in% c(f,"Ethiopia")) %>% mutate(Country=if_else(Country=="Ethiopia", "z_Ethiopia", Country))
tf<-comp_table[,tm$SampleID] %>% filter_features(10,10)
tf<-summarize_taxa(tf, comp_taxonomy %>% as.data.frame())$Family
comp_results[[f]]<-aldex(reads=tf, conditions = tm$Country, mc.samples = 128) %>% rownames_to_column("Taxon") %>% mutate(Contrast=paste("Ethiopia vs", f))
}
rm(tm, tf)
comp_results %>%
bind_rows() %>%
interactive_table()comp_results %>%
bind_rows() %>%
mutate(Sig=if_else(abs(diff.btw)>1 & we.eBH<0.1, "*","ns")) %>%
mutate(Label=if_else(Sig=="*", gsub("..+; ","", Taxon) %>% gsub("NA","", .), "")) %>%
mutate(Pval=we.ep+min_nonzero(we.ep)) %>% # to get these on the plot
ggplot(aes(x=diff.btw, y=-log10(Pval), color=Sig)) +
geom_point(shape=16, alpha=0.5) +
facet_wrap(~Contrast, ncol=2, scales="free") +
xlab("log2Fold Change (Ethiopia vs Cohort)") +
scale_color_manual(values=c("indianred","grey50")) +
ggrepel::geom_text_repel(aes(label=Label), size=2) +
theme(legend.position="none") +
ylab("-log10(P-value)")covariates<-
metadata %>%
left_join(read_excel("inputs/mycotoxins.xlsx")) %>%
dplyr::select(SampleID,
Group,
Sex,
Age,
CigaretteUse=Smoking,
ChewingTobaccoUse=`Chewing tobacco`,
AlcoholUse=`Alcohol Drink`,
KhatUse=`Khat use`,
CoffeeUse=`Coffee drinking`,
Mycotoxin_CIT=CIT,
Mycotoxin_DON=DON,
Mycotoxin_OTA=OTA,
Mycotoxin_TA=TA
) %>%
left_join(
read_excel("inputs/mycotoxins.xlsx") %>%
column_to_rownames("SampleID") %>%
apply(., 1, function(x) sum(x>0)) %>%
data.frame(Mycotoxin_TotalDetected=.) %>%
rownames_to_column("SampleID")
) %>%
left_join(
read_excel("inputs/mycotoxins.xlsx") %>%
column_to_rownames("SampleID") %>%
apply(., 1, function(x) sum(x)) %>%
data.frame(Mycotoxin_TotalConcentration=.) %>%
rownames_to_column("SampleID")
)
interactive_table(covariates)nonzero<-function(x){sum(x>0)}
covariates %>%
pivot_longer(contains("Mycotoxin"), names_to = "Metric", values_to="conc_ugL") %>%
group_by(Group, Metric) %>%
dplyr::select(-Age) %>%
summarize_if(is.numeric, lst(mean,sd,median,min,max, length, nonzero)) %>%
arrange(Metric) %>%
mutate(Mean_SD=paste0(mean, " ± ", sd),
median_range=paste0(median, " [",min," - ",max, "]")) %>%
dplyr::select(Group, Metric, Mean_SD, median_range, nonzero) %>%
interactive_table()covariates %>%
pivot_longer(contains("Mycotoxin"), names_to = "Metric", values_to="conc_ugL") %>%
group_by(Metric) %>%
do(
wilcox.test(conc_ugL~Group, data=.) %>%
broom::tidy()
) %>%
interactive_table()covariates %>%
pivot_longer(contains("Mycotoxin"), names_to = "Metric", values_to="conc_ugL") %>%
group_by(Group, Metric) %>%
dplyr::select(-Age) %>%
summarize_if(is.numeric, lst(mean,sd,median,min,max, length, detected=nonzero)) %>%
ungroup() %>%
mutate(notdetected=length-detected) %>%
dplyr::select(Group, Metric, detected, notdetected) %>%
group_by(Metric) %>%
summarize(Fisher_Pvalue=fisher.test(matrix(c(detected, notdetected), nrow =2))$p)## # A tibble: 6 × 2
## Metric Fisher_Pvalue
## <chr> <dbl>
## 1 Mycotoxin_CIT 2.81e- 4
## 2 Mycotoxin_DON 7.32e- 1
## 3 Mycotoxin_OTA 1 e+ 0
## 4 Mycotoxin_TA 4.59e-12
## 5 Mycotoxin_TotalConcentration 1 e+ 0
## 6 Mycotoxin_TotalDetected 1 e+ 0
Dropping CIT as co-variate for healthy cohort as not found in healthy group. Also, dropping total mycotoxin going forward as it is highly co-linear with the levels of TA.
Finding a model that minimizes the AIC of the model.
alpha_cov<-
fig1_adiversity %>%
dplyr::select(SampleID, PhylogeneticDiversity, ASV_Richness, ShannonDiversity) %>%
left_join(covariates) %>%
filter(Group=="Healthy")
fit<-glm(ShannonDiversity~Sex + Age + AlcoholUse + KhatUse + CoffeeUse + Mycotoxin_DON + Mycotoxin_OTA + Mycotoxin_TA + Mycotoxin_TotalDetected, data=alpha_cov)
summary(fit) # 161.05##
## Call:
## glm(formula = ShannonDiversity ~ Sex + Age + AlcoholUse + KhatUse +
## CoffeeUse + Mycotoxin_DON + Mycotoxin_OTA + Mycotoxin_TA +
## Mycotoxin_TotalDetected, data = alpha_cov)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.8777141 0.3459136 11.210 <2e-16 ***
## SexM 0.2387724 0.1230321 1.941 0.0552 .
## Age -0.0039520 0.0080251 -0.492 0.6235
## AlcoholUseYes -0.2882365 0.1409137 -2.045 0.0435 *
## KhatUseYes 0.1995013 0.2513512 0.794 0.4293
## CoffeeUseYes 0.1761844 0.1702886 1.035 0.3034
## Mycotoxin_DON -0.1991542 0.6554466 -0.304 0.7619
## Mycotoxin_OTA 0.0461860 0.0339663 1.360 0.1770
## Mycotoxin_TA 0.0006513 0.0019799 0.329 0.7429
## Mycotoxin_TotalDetected 0.0913024 0.0835819 1.092 0.2773
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.23381)
##
## Null deviance: 26.220 on 107 degrees of freedom
## Residual deviance: 22.913 on 98 degrees of freedom
## AIC: 161.05
##
## Number of Fisher Scoring iterations: 2
#khat use only 4 individuals, remove
fit<-glm(ShannonDiversity~Sex + Age + AlcoholUse + CoffeeUse + Mycotoxin_DON + Mycotoxin_OTA + Mycotoxin_TA + Mycotoxin_TotalDetected, data=alpha_cov)
summary(fit) # 159.74##
## Call:
## glm(formula = ShannonDiversity ~ Sex + Age + AlcoholUse + CoffeeUse +
## Mycotoxin_DON + Mycotoxin_OTA + Mycotoxin_TA + Mycotoxin_TotalDetected,
## data = alpha_cov)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.8520789 0.3437583 11.206 <2e-16 ***
## SexM 0.2344495 0.1226815 1.911 0.0589 .
## Age -0.0036321 0.0080000 -0.454 0.6508
## AlcoholUseYes -0.2864388 0.1406320 -2.037 0.0443 *
## CoffeeUseYes 0.1854984 0.1695660 1.094 0.2766
## Mycotoxin_DON -0.1966582 0.6542131 -0.301 0.7643
## Mycotoxin_OTA 0.0441414 0.0338051 1.306 0.1947
## Mycotoxin_TA 0.0007249 0.0019741 0.367 0.7143
## Mycotoxin_TotalDetected 0.0991840 0.0828347 1.197 0.2340
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2329361)
##
## Null deviance: 26.220 on 107 degrees of freedom
## Residual deviance: 23.061 on 99 degrees of freedom
## AIC: 159.74
##
## Number of Fisher Scoring iterations: 2
#similarly, only 9 people don't consume coffee
fit<-glm(ShannonDiversity~Sex + Age + AlcoholUse + Mycotoxin_DON + Mycotoxin_OTA + Mycotoxin_TA + Mycotoxin_TotalDetected, data=alpha_cov)
summary(fit) # 159.04##
## Call:
## glm(formula = ShannonDiversity ~ Sex + Age + AlcoholUse + Mycotoxin_DON +
## Mycotoxin_OTA + Mycotoxin_TA + Mycotoxin_TotalDetected, data = alpha_cov)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.9970994 0.3174811 12.590 <2e-16 ***
## SexM 0.2367042 0.1227848 1.928 0.0567 .
## Age -0.0029925 0.0079864 -0.375 0.7087
## AlcoholUseYes -0.2968261 0.1404490 -2.113 0.0371 *
## Mycotoxin_DON -0.2012688 0.6548428 -0.307 0.7592
## Mycotoxin_OTA 0.0471674 0.0337249 1.399 0.1650
## Mycotoxin_TA 0.0005965 0.0019725 0.302 0.7630
## Mycotoxin_TotalDetected 0.0994142 0.0829159 1.199 0.2334
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2333944)
##
## Null deviance: 26.220 on 107 degrees of freedom
## Residual deviance: 23.339 on 100 degrees of freedom
## AIC: 159.04
##
## Number of Fisher Scoring iterations: 2
#Mycotoxin_TotalDetected is an integer which might cause some issues, will remove
fit<-glm(ShannonDiversity~Sex + Age + AlcoholUse + Mycotoxin_DON + Mycotoxin_OTA + Mycotoxin_TA, data=alpha_cov)
summary(fit) # 158.58##
## Call:
## glm(formula = ShannonDiversity ~ Sex + Age + AlcoholUse + Mycotoxin_DON +
## Mycotoxin_OTA + Mycotoxin_TA, data = alpha_cov)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.106489 0.304747 13.475 <2e-16 ***
## SexM 0.230503 0.122941 1.875 0.0637 .
## Age -0.001822 0.007944 -0.229 0.8190
## AlcoholUseYes -0.287484 0.140536 -2.046 0.0434 *
## Mycotoxin_DON 0.030134 0.627109 0.048 0.9618
## Mycotoxin_OTA 0.046432 0.033792 1.374 0.1725
## Mycotoxin_TA 0.001593 0.001793 0.889 0.3762
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2344055)
##
## Null deviance: 26.220 on 107 degrees of freedom
## Residual deviance: 23.675 on 101 degrees of freedom
## AIC: 158.58
##
## Number of Fisher Scoring iterations: 2
#Minimize to just obvious variables with larger coefficients
fit<-glm(ShannonDiversity~Sex + Age + AlcoholUse, data=alpha_cov)
summary(fit) # 155.62##
## Call:
## glm(formula = ShannonDiversity ~ Sex + Age + AlcoholUse, data = alpha_cov)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.247792 0.292244 14.535 <2e-16 ***
## SexM 0.243276 0.120092 2.026 0.0454 *
## Age -0.003609 0.007856 -0.459 0.6469
## AlcoholUseYes -0.305634 0.139361 -2.193 0.0305 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2341533)
##
## Null deviance: 26.220 on 107 degrees of freedom
## Residual deviance: 24.352 on 104 degrees of freedom
## AIC: 155.62
##
## Number of Fisher Scoring iterations: 2
##
## Call:
## glm(formula = ShannonDiversity ~ Sex + AlcoholUse, data = alpha_cov)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.11599 0.05541 74.283 <2e-16 ***
## SexM 0.22650 0.11397 1.987 0.0495 *
## AlcoholUseYes -0.32979 0.12858 -2.565 0.0117 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2323939)
##
## Null deviance: 26.220 on 107 degrees of freedom
## Residual deviance: 24.401 on 105 degrees of freedom
## AIC: 153.84
##
## Number of Fisher Scoring iterations: 2
Below, will show all tested coefficients, but will only report sex and alcohol use as significant based on final model.
fit<-glm(ShannonDiversity~Sex + Age + AlcoholUse + KhatUse + CoffeeUse + Mycotoxin_DON + Mycotoxin_OTA + Mycotoxin_TA + Mycotoxin_TotalDetected, data=alpha_cov)
summary(fit)##
## Call:
## glm(formula = ShannonDiversity ~ Sex + Age + AlcoholUse + KhatUse +
## CoffeeUse + Mycotoxin_DON + Mycotoxin_OTA + Mycotoxin_TA +
## Mycotoxin_TotalDetected, data = alpha_cov)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.8777141 0.3459136 11.210 <2e-16 ***
## SexM 0.2387724 0.1230321 1.941 0.0552 .
## Age -0.0039520 0.0080251 -0.492 0.6235
## AlcoholUseYes -0.2882365 0.1409137 -2.045 0.0435 *
## KhatUseYes 0.1995013 0.2513512 0.794 0.4293
## CoffeeUseYes 0.1761844 0.1702886 1.035 0.3034
## Mycotoxin_DON -0.1991542 0.6554466 -0.304 0.7619
## Mycotoxin_OTA 0.0461860 0.0339663 1.360 0.1770
## Mycotoxin_TA 0.0006513 0.0019799 0.329 0.7429
## Mycotoxin_TotalDetected 0.0913024 0.0835819 1.092 0.2773
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.23381)
##
## Null deviance: 26.220 on 107 degrees of freedom
## Residual deviance: 22.913 on 98 degrees of freedom
## AIC: 161.05
##
## Number of Fisher Scoring iterations: 2
summary(fit)$coefficients %>%
as.data.frame() %>%
rownames_to_column("Covariate") %>%
filter(Covariate!="(Intercept)") %>%
arrange(Estimate) %>%
mutate(Covariate=factor(Covariate, levels=Covariate)) %>%
ggplot(aes(x=Covariate, y=Estimate, ymin=Estimate-`Std. Error`, ymax=Estimate+`Std. Error`, label=paste("P=", round(`Pr(>|t|)`,2)))) +
geom_hline(yintercept = 0, linetype="dashed", color="grey50") +
geom_point() +
geom_errorbar(width=0) +
geom_text(size=2) +
coord_flip() +
ylab("Effect on Shannon's Diversity (Estimate ± SE)")alpha_cov %>%
ggplot(aes(x=Sex, y=ShannonDiversity, fill=AlcoholUse)) +
#geom_boxplot(outlier.alpha=0,) +
stat_summary(geom="bar", position=position_dodge(), color="black") +
geom_point(position=position_jitterdodge(jitter.width=0.2, jitter.height=0), shape=21) +
coord_cartesian(ylim=c(2, 6)) +
scale_fill_manual(values=c("grey50","indianred"))ggsave("manuscript_figures/sex_alcohol.pdf", height=2, width=2.5, useDingbats=F)
fit<-alpha_cov %>% aov(ShannonDiversity~AlcoholUse*Sex, data=.)
summary(fit)## Df Sum Sq Mean Sq F value Pr(>F)
## AlcoholUse 1 0.900 0.9005 3.840 0.0527 .
## Sex 1 0.918 0.9178 3.914 0.0505 .
## AlcoholUse:Sex 1 0.012 0.0116 0.049 0.8245
## Residuals 104 24.390 0.2345
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vegan::adonis2(dists$`CLR Euclidean`~Sex + Age + AlcoholUse + KhatUse + CoffeeUse + Mycotoxin_DON + Mycotoxin_OTA + Mycotoxin_TA + Mycotoxin_TotalDetected,
data=covariates[match(labels(dists$`CLR Euclidean`), covariates$SampleID),],by="terms", permutations=9999, parallel=10) %>% print()## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 9999
##
## vegan::adonis2(formula = dists$`CLR Euclidean` ~ Sex + Age + AlcoholUse + KhatUse + CoffeeUse + Mycotoxin_DON + Mycotoxin_OTA + Mycotoxin_TA + Mycotoxin_TotalDetected, data = covariates[match(labels(dists$`CLR Euclidean`), covariates$SampleID), ], permutations = 9999, by = "terms", parallel = 10)
## Df SumOfSqs R2 F Pr(>F)
## Sex 1 2370 0.01778 1.9098 0.0320 *
## Age 1 996 0.00748 0.8031 0.6750
## AlcoholUse 1 997 0.00748 0.8035 0.6759
## KhatUse 1 1380 0.01036 1.1121 0.2786
## CoffeeUse 1 835 0.00627 0.6728 0.8852
## Mycotoxin_DON 1 1117 0.00838 0.9000 0.5290
## Mycotoxin_OTA 1 2082 0.01563 1.6784 0.0563 .
## Mycotoxin_TA 1 992 0.00744 0.7993 0.6790
## Mycotoxin_TotalDetected 1 874 0.00656 0.7046 0.8412
## Residual 98 121596 0.91262
## Total 107 133238 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pcos$`CLR Euclidean`$vectors %>%
as.data.frame() %>%
rownames_to_column("SampleID") %>%
left_join(metadata) %>%
left_join(clustered_metadata) %>%
ggplot(aes(x=Sex, y=Axis.2, fill=Sex)) +
geom_boxplot() +
#stat_summary(geom="bar", position=position_dodge(), color="black") +
geom_point(position=position_jitterdodge(jitter.width=0.2, jitter.height=0), shape=21) +
ylab(paste0("PC2: ",round(pcos$`CLR Euclidean`$values$Relative_eig[2]*100,2),"%")) +
coord_cartesian(ylim=c(-20, 35)) +
scale_fill_manual(values=c("grey50","dodgerblue"))ggsave("manuscript_figures/sex.pdf", heigh=2, width=2, useDingbats=F)
pcos$`CLR Euclidean`$vectors %>%
as.data.frame() %>%
rownames_to_column("SampleID") %>%
left_join(metadata) %>%
left_join(clustered_metadata) %>%
t.test(Axis.2~Sex, data=.)##
## Welch Two Sample t-test
##
## data: Axis.2 by Sex
## t = -2.5498, df = 37.359, p-value = 0.01501
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
## -10.29551 -1.17964
## sample estimates:
## mean in group F mean in group M
## -1.487520 4.250057
adiversity<-
asv_table %>% subsample_table() %>% t() %>% picante::pd(., asv_tree) %>%
rownames_to_column("SampleID") %>%
dplyr::select(SampleID, PhylogeneticDiversity="PD", ASV_Richness="SR") %>%
full_join(
asv_table %>% subsample_table() %>% t() %>% diversity(., "shannon") %>% data.frame(ShannonDiversity=.) %>% rownames_to_column("SampleID")
) %>%
left_join(metadata)
interactive_table(adiversity)adiversity %>%
pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
group_by(Metric, Group) %>%
summarize_at("Diversity", lst(mean,median,sd,min,max)) %>%
interactive_table()adiversity %>%
pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
ggplot(aes(x=Group, y=Diversity, fill=Group)) +
geom_boxplot() +
facet_wrap(~Metric, scales="free") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
theme(legend.position="none") +
xlab("") +
ylab("") +
scale_fill_manual(values=c("cornflowerblue","indianred"))ggsave("manuscript_figures/cross_section_alpha_diversity.pdf", height=2, width=3, useDingbats=F)
adiversity %>%
pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
group_by(Metric) %>%
do(
t.test(Diversity~Group, data=.) %>%
broom::tidy()
) %>%
knitr::kable()| Metric | estimate | estimate1 | estimate2 | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|---|---|---|
| ASV_Richness | 60.3993168 | 289.574074 | 229.174757 | 4.536983 | 0.0000096 | 207.6511 | 34.1540188 | 86.6446148 | Welch Two Sample t-test | two.sided |
| PhylogeneticDiversity | 2.5903518 | 16.557070 | 13.966718 | 4.959897 | 0.0000015 | 208.9131 | 1.5607783 | 3.6199253 | Welch Two Sample t-test | two.sided |
| ShannonDiversity | 0.1950222 | 4.116715 | 3.921693 | 3.019831 | 0.0028451 | 208.4917 | 0.0677077 | 0.3223367 | Welch Two Sample t-test | two.sided |
adiversity %>%
pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
group_by(Metric) %>%
do(
wilcox.test(Diversity~Group, data=.) %>%
broom::tidy()
) %>%
knitr::kable()| Metric | statistic | p.value | method | alternative |
|---|---|---|---|---|
| ASV_Richness | 7447.5 | 0.0000212 | Wilcoxon rank sum test with continuity correction | two.sided |
| PhylogeneticDiversity | 7557.0 | 0.0000068 | Wilcoxon rank sum test with continuity correction | two.sided |
| ShannonDiversity | 6892.0 | 0.0027084 | Wilcoxon rank sum test with continuity correction | two.sided |
adiversity %>%
pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
group_by(Metric) %>%
do(
glm(Diversity~Group + Age + Sex + `Alcohol Drink`, data=.) %>%
broom::tidy()
) %>%
knitr::kable()| Metric | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| ASV_Richness | (Intercept) | 306.7554582 | 26.2328232 | 11.6935740 | 0.0000000 |
| ASV_Richness | GroupDiseased | -56.3860313 | 16.1213214 | -3.4976061 | 0.0005750 |
| ASV_Richness | Age | -0.3643145 | 0.6593569 | -0.5525300 | 0.5811843 |
| ASV_Richness | SexM | -4.5604812 | 15.7309539 | -0.2899049 | 0.7721806 |
| ASV_Richness | Alcohol DrinkYes |
-9.7430378 | 21.9441927 | -0.4439916 | 0.6575147 |
| PhylogeneticDiversity | (Intercept) | 17.6813408 | 1.0249613 | 17.2507394 | 0.0000000 |
| PhylogeneticDiversity | GroupDiseased | -2.1844359 | 0.6298876 | -3.4679771 | 0.0006384 |
| PhylogeneticDiversity | Age | -0.0282538 | 0.0257622 | -1.0967146 | 0.2740466 |
| PhylogeneticDiversity | SexM | -0.2064235 | 0.6146353 | -0.3358472 | 0.7373279 |
| PhylogeneticDiversity | Alcohol DrinkYes |
0.1625281 | 0.8573972 | 0.1895598 | 0.8498407 |
| ShannonDiversity | (Intercept) | 4.2067288 | 0.1259261 | 33.4063259 | 0.0000000 |
| ShannonDiversity | GroupDiseased | -0.2117943 | 0.0773876 | -2.7367985 | 0.0067462 |
| ShannonDiversity | Age | -0.0018969 | 0.0031651 | -0.5993137 | 0.5496223 |
| ShannonDiversity | SexM | 0.0905621 | 0.0755137 | 1.1992807 | 0.2317964 |
| ShannonDiversity | Alcohol DrinkYes |
-0.2137518 | 0.1053393 | -2.0291749 | 0.0437270 |
adiversity %>%
pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
filter(Metric=="ASV_Richness") %>%
do(
glm.nb(Diversity~Group + Age + Sex + `Alcohol Drink`, data=.) %>%
broom::tidy()
) %>%
knitr::kable()| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 5.7514750 | 0.1086749 | 52.9236803 | 0.0000000 |
| GroupDiseased | -0.2133090 | 0.0667491 | -3.1956840 | 0.0013950 |
| Age | -0.0017937 | 0.0027327 | -0.6563806 | 0.5115793 |
| SexM | -0.0284362 | 0.0651401 | -0.4365396 | 0.6624453 |
Alcohol DrinkYes |
-0.0266784 | 0.0908267 | -0.2937282 | 0.7689656 |
genus_sum<-summarize_taxa(asv_table, asv_taxonomy %>% column_to_rownames("ASV"))$Genus
dists<-list()
dists$`CLR Euclidean`<-genus_sum %>% make_clr() %>% t() %>% dist(., method="euclidean")
pcos<-lapply(dists, ape::pcoa)
lapply(names(pcos), function(x){
pcos[[x]]$values %>%
as.data.frame() %>%
mutate(Metric=x) %>%
mutate(Axis=1:nrow(.))
}) %>%
bind_rows() %>%
filter(Axis %in% 1:3) %>%
dplyr::select(Metric, Axis, everything()) %>%
interactive_table() pcos$`CLR Euclidean`$vectors %>%
as.data.frame() %>%
rownames_to_column("SampleID") %>%
left_join(metadata) %>%
ggplot(aes(x=Axis.1, y=Axis.2, fill=Group)) +
geom_point(shape=21, color="black") +
theme(legend.position="bottom") +
xlab(paste0("PC1: ",round(pcos$`CLR Euclidean`$values$Relative_eig[1]*100,2),"%")) +
ylab(paste0("PC2: ",round(pcos$`CLR Euclidean`$values$Relative_eig[2]*100,2),"%")) +
scale_fill_manual(values=c("cornflowerblue","indianred")) ggsave("manuscript_figures/cancer_pcoa_aitchison.pdf", height=2.2, width=2)
vegan::adonis2(dists$`CLR Euclidean`~Group + Age + Sex + `Alcohol Drink`, data=metadata[match(labels(dists$`CLR Euclidean`), metadata$SampleID),], by="terms") %>% print()## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## vegan::adonis2(formula = dists$`CLR Euclidean` ~ Group + Age + Sex + `Alcohol Drink`, data = metadata[match(labels(dists$`CLR Euclidean`), metadata$SampleID), ], by = "terms")
## Df SumOfSqs R2 F Pr(>F)
## Group 1 13037 0.04582 10.0316 0.001 ***
## Age 1 1261 0.00443 0.9699 0.405
## Sex 1 1394 0.00490 1.0725 0.288
## `Alcohol Drink` 1 1098 0.00386 0.8447 0.594
## Residual 206 267725 0.94099
## Total 210 284515 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Metric="CLR Euclidean"
gapstats<-clusGap(pcos[[Metric]]$vectors, FUN=do_pam, K.max = 10, B=100)
gapstats<-
gapstats$Tab %>%
as.data.frame() %>%
mutate(Nclusters=1:nrow(.)) %>%
dplyr::select(Nclusters, GapStatistic=gap, SE=SE.sim)
ggplot(gapstats, aes(x=Nclusters, y=GapStatistic, ymin=GapStatistic-SE, ymax=GapStatistic+SE)) +
geom_errorbar(width=0) +
geom_point() +
scale_x_continuous(breaks=(1:10))clusters<-pam(dists[[Metric]], k=2, diss=TRUE, cluster.only = TRUE)
clustered_metadata_all<-tibble(SampleID=names(clusters), Cluster_all=paste0("Cluster_", clusters))
clustered_metadata %>% left_join(clustered_metadata_all) %>% interactive_table() #they are flipped relative to previous assignments to 1 vs 2, will re-arranged.clustered_metadata_all<-clustered_metadata_all %>% mutate(Cluster=if_else(Cluster_all=="Cluster_1", "Cluster_2","Cluster_1"))
metadata %>% left_join(clustered_metadata_all) %>%
ggplot(aes(x=Group, fill=Cluster)) +
geom_bar(color="black") +
scale_fill_manual(values=c("darkorange","purple4")) +
ylab("# Participants") ggsave("manuscript_figures/cancer_clusters.pdf", height=2, width=2)
metadata %>% left_join(clustered_metadata_all) %>%
group_by(Group, Cluster) %>%
summarize(N=n()) %>%
pivot_wider(names_from = "Cluster", values_from = "N") %>%
knitr::kable()| Group | Cluster_1 | Cluster_2 |
|---|---|---|
| Healthy | 70 | 38 |
| Diseased | 37 | 66 |
metadata %>% left_join(clustered_metadata_all) %>%
group_by(Group, Cluster) %>%
summarize(N=n()) %>%
pivot_wider(names_from = "Cluster", values_from = "N") %>%
dplyr::select(1,3,2) %>%
as.data.frame() %>%
column_to_rownames("Group") %>%
.[c(2,1),] %>% #flip order to change how OR is described
fisher.test()##
## Fisher's Exact Test for Count Data
##
## data: .
## p-value = 3.36e-05
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.801207 6.009433
## sample estimates:
## odds ratio
## 3.266217
Now will confirm with multivariable logistic regression
fit<-
metadata %>% left_join(clustered_metadata_all) %>%
mutate(DiseaseBin=if_else(Group=="Diseased", 1, 0)) %>%
glm(DiseaseBin~ Cluster + Age + Sex + `Alcohol Drink`, family=binomial, data=.)
summary(fit) ##
## Call:
## glm(formula = DiseaseBin ~ Cluster + Age + Sex + `Alcohol Drink`,
## family = binomial, data = .)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.70210 1.03267 -6.490 8.58e-11 ***
## ClusterCluster_2 1.06720 0.35347 3.019 0.00253 **
## Age 0.14581 0.02426 6.010 1.86e-09 ***
## SexM 0.20069 0.42789 0.469 0.63905
## `Alcohol Drink`Yes -2.61517 0.65653 -3.983 6.80e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 292.39 on 210 degrees of freedom
## Residual deviance: 197.33 on 206 degrees of freedom
## AIC: 207.33
##
## Number of Fisher Scoring iterations: 5
| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.0012283 | 1.0326684 | -6.4900813 | 0.0000000 | 0.0001390 | 0.0080986 |
| ClusterCluster_2 | 2.9072295 | 0.3534698 | 3.0192132 | 0.0025343 | 1.4671815 | 5.8979473 |
| Age | 1.1569814 | 0.0242626 | 6.0098498 | 0.0000000 | 1.1069684 | 1.2178495 |
| SexM | 1.2222476 | 0.4278900 | 0.4690259 | 0.6390511 | 0.5252974 | 2.8410213 |
Alcohol DrinkYes |
0.0731554 | 0.6565310 | -3.9833142 | 0.0000680 | 0.0183859 | 0.2464655 |
cancer_asvs<-asv_table[,subset(metadata, Group=="Diseased")$SampleID] %>% filter_features(1,1)
adiversity<-
cancer_asvs %>% subsample_table() %>% t() %>% picante::pd(., asv_tree) %>%
rownames_to_column("SampleID") %>%
dplyr::select(SampleID, PhylogeneticDiversity="PD", ASV_Richness="SR") %>%
full_join(
cancer_asvs %>% subsample_table() %>% t() %>% diversity(., "shannon") %>% data.frame(ShannonDiversity=.) %>% rownames_to_column("SampleID")
) %>%
left_join(metadata)
interactive_table(adiversity)adiversity %>%
pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
group_by(Metric, Histology) %>%
summarize_at("Diversity", lst(mean,median,sd,min,max)) %>%
interactive_table()adiversity %>%
pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
ggplot(aes(x=Histology, y=Diversity, fill=Histology)) +
geom_boxplot() +
facet_wrap(~Metric, scales="free") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
theme(legend.position="none") +
xlab("") +
ylab("")ggsave("manuscript_figures/histology_alpha_diversity.pdf", height=2, width=3, useDingbats=F)
adiversity %>%
pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
group_by(Metric) %>%
do(
t.test(Diversity~Histology, data=.) %>%
broom::tidy()
) %>%
knitr::kable()| Metric | estimate | estimate1 | estimate2 | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|---|---|---|
| ASV_Richness | -6.5387931 | 224.875000 | 231.413793 | -0.2675944 | 0.7916005 | 21.17681 | -57.3292898 | 44.251704 | Welch Two Sample t-test | two.sided |
| PhylogeneticDiversity | -0.1683577 | 13.923964 | 14.092322 | -0.1634073 | 0.8717653 | 20.93045 | -2.3114069 | 1.974691 | Welch Two Sample t-test | two.sided |
| ShannonDiversity | -0.0122405 | 3.912341 | 3.924581 | -0.1023655 | 0.9194256 | 21.26808 | -0.2607219 | 0.236241 | Welch Two Sample t-test | two.sided |
adiversity %>%
pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
group_by(Metric) %>%
do(
wilcox.test(Diversity~Histology, data=.) %>%
broom::tidy()
) %>%
knitr::kable()| Metric | statistic | p.value | method | alternative |
|---|---|---|---|---|
| ASV_Richness | 680.5 | 0.8913695 | Wilcoxon rank sum test with continuity correction | two.sided |
| PhylogeneticDiversity | 688.0 | 0.9455600 | Wilcoxon rank sum test with continuity correction | two.sided |
| ShannonDiversity | 680.0 | 0.8877758 | Wilcoxon rank sum test with continuity correction | two.sided |
adiversity %>%
pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
group_by(Metric) %>%
do(
glm(Diversity~Histology + Age + Sex + `Alcohol Drink`, data=.) %>%
broom::tidy()
) %>%
knitr::kable()| Metric | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| ASV_Richness | (Intercept) | 280.7962636 | 42.3845654 | 6.6249650 | 0.0000000 |
| ASV_Richness | HistologyESCC | 0.2271049 | 24.7288868 | 0.0091838 | 0.9926912 |
| ASV_Richness | Age | -0.7582168 | 0.6805618 | -1.1141043 | 0.2679594 |
| ASV_Richness | SexM | -30.3218279 | 19.3841082 | -1.5642622 | 0.1209795 |
| ASV_Richness | Alcohol DrinkYes |
14.3069739 | 38.8977330 | 0.3678100 | 0.7138081 |
| PhylogeneticDiversity | (Intercept) | 17.3448585 | 1.7359790 | 9.9913987 | 0.0000000 |
| PhylogeneticDiversity | HistologyESCC | -0.1751294 | 1.0128411 | -0.1729091 | 0.8630795 |
| PhylogeneticDiversity | Age | -0.0517987 | 0.0278743 | -1.8582938 | 0.0661289 |
| PhylogeneticDiversity | SexM | -1.2657805 | 0.7939306 | -1.5943212 | 0.1140843 |
| PhylogeneticDiversity | Alcohol DrinkYes |
0.9229716 | 1.5931660 | 0.5793317 | 0.5636947 |
| ShannonDiversity | (Intercept) | 3.9630797 | 0.2134718 | 18.5648865 | 0.0000000 |
| ShannonDiversity | HistologyESCC | 0.0060704 | 0.1245482 | 0.0487392 | 0.9612263 |
| ShannonDiversity | Age | -0.0005871 | 0.0034277 | -0.1712953 | 0.8643448 |
| ShannonDiversity | SexM | -0.0334025 | 0.0976289 | -0.3421375 | 0.7329801 |
| ShannonDiversity | Alcohol DrinkYes |
-0.0284999 | 0.1959102 | -0.1454745 | 0.8846350 |
adiversity %>%
pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
filter(Metric=="ASV_Richness") %>%
do(
glm.nb(Diversity~Histology + Age + Sex + `Alcohol Drink`, data=.) %>%
broom::tidy()
) %>%
knitr::kable()| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 5.6414444 | 0.1978516 | 28.5135157 | 0.0000000 |
| HistologyESCC | 0.0028321 | 0.1154336 | 0.0245346 | 0.9804262 |
| Age | -0.0030589 | 0.0031781 | -0.9624956 | 0.3358007 |
| SexM | -0.1317515 | 0.0904872 | -1.4560238 | 0.1453860 |
Alcohol DrinkYes |
0.0599459 | 0.1815510 | 0.3301875 | 0.7412583 |
genus_sum<-summarize_taxa(cancer_asvs, asv_taxonomy %>% column_to_rownames("ASV"))$Genus
dists<-list()
dists$`CLR Euclidean`<-genus_sum %>% make_clr() %>% t() %>% dist(., method="euclidean")
pcos<-lapply(dists, ape::pcoa)
pcos$`CLR Euclidean`$vectors %>%
as.data.frame() %>%
rownames_to_column("SampleID") %>%
left_join(metadata) %>%
ggplot(aes(x=Axis.1, y=Axis.2, fill=Histology)) +
geom_point(shape=21, color="black") +
theme(legend.position="bottom") +
xlab(paste0("PC1: ",round(pcos$`CLR Euclidean`$values$Relative_eig[1]*100,2),"%")) +
ylab(paste0("PC2: ",round(pcos$`CLR Euclidean`$values$Relative_eig[2]*100,2),"%")) ggsave("manuscript_figures/histology_pcoa_aitchison.pdf", height=2.2, width=2)
vegan::adonis2(dists$`CLR Euclidean`~Histology + Age + Sex + `Alcohol Drink`, data=metadata[match(labels(dists$`CLR Euclidean`), metadata$SampleID),])## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
##
## vegan::adonis2(formula = dists$`CLR Euclidean` ~ Histology + Age + Sex + `Alcohol Drink`, data = metadata[match(labels(dists$`CLR Euclidean`), metadata$SampleID), ])
## Df SumOfSqs R2 F Pr(>F)
## Model 4 6121 0.04581 1.1763 0.141
## Residual 98 127478 0.95419
## Total 102 133599 1.00000
genus_sums<-summarize_taxa(asv_table, asv_taxonomy %>% column_to_rownames("ASV"))$Species # disregard variable name of genus, species abundances were carried forward for greater specificity
genus_sums<-genus_sums[,metadata$SampleID] %>% filter_features(20,20) # must be in at least 20 samples
clr<-aldex.clr(genus_sums, model.matrix(~Group + Sex + Age + `Alcohol Drink`, metadata), mc.samples = 128, denom = "all")
aldex_results<-aldex.glm(clr, fdr.method="BH")
aldex_results %>%
rownames_to_column("Taxon") %>%
dplyr::select(Taxon, Group_Estimate=`GroupDiseased:Est`, Group_Pvalue=`GroupDiseased:pval`, Group_FDR=`GroupDiseased:pval.padj`) %>%
filter(Group_FDR<0.1) %>%
arrange(desc(Group_Estimate)) %>%
interactive_table()treedat<-
aldex_results %>%
rownames_to_column("Taxon") %>%
dplyr::select(Taxon, Group_Estimate=`GroupDiseased:Est`, Group_SE=`GroupDiseased:SE`, Group_Pvalue=`GroupDiseased:pval`, Group_FDR=`GroupDiseased:pval.padj`) %>%
filter(Group_FDR<0.1) %>%
mutate(Taxon=paste0("root; ", Taxon)) %>%
separate(Taxon, c("Root","Kingdom","Phylum","Class","Order","Family","Genus","Species"), sep="; ", remove = F) %>%
mutate(Species=paste0("S",1:nrow(.),":",Genus," ", Species)) %>%
mutate_if(is.character, function(x) if_else(x=="NA", "Not Assigned", x)) %>% #replace empty character columns with Not assigned
mutate(pathString=paste(Root, Kingdom, Phylum, Class, Order, Family, Genus, Species, sep="/")) %>% #create a new column pathString by concatenating values from Kingdom, Phylum, Class, Order, Family, and Genus columns, separated by "/".
arrange(pathString)
clado <-
treedat %>%
dplyr::select(pathString, Taxon) %>%
data.tree::as.Node() %>%
data.tree::as.phylo.Node()
plottree<-ggtree( clado, layout="circular") %<+% (treedat %>% mutate(Snum=gsub(":..+","", Species)) %>% dplyr::select(Snum, everything())) +
geom_tippoint(aes(fill=Group_Estimate, size=-log10(Group_FDR)), shape=21) +
geom_tiplab(aes(label=gsub("NA", "sp.", Species) %>% gsub("S[0-9]+:","",.)), size=2, offset=0.7) +
scale_fill_gradient2(low="cornflowerblue",high="darkred") +
theme(legend.position="bottom")
interest<-c("Clostridia","Veillonellales-Selenomonadales","Lactobacillales","Actinobacteriota","Bacteroidota","Gammaproteobacteria")
for( i in interest){
message(i)
tips<-treedat %>% filter(grepl(i, pathString)) %>% pull(Species) %>% gsub(":..+","", .)
nd<-getMRCA(phy=clado, tip=as.character(tips))
plottree<-
plottree +
#geom_hilight(
#node=nd,
#fill="grey50",
#alpha=0.2
#) +
geom_cladelabel(node=nd, label=i, offset = 10)
}
plottreepicrust<-read_tsv("picrust2_out_strat/pathways_w_desc.tsv")
picrust<-picrust[,c("pathway","description", colnames(asv_table))]
picrust_results<-
picrust %>%
pivot_longer(!c(pathway, description), names_to = "SampleID", values_to = "Abundance") %>%
group_by(SampleID) %>%
mutate(Abundance=(Abundance/sum(Abundance))) %>% # convert to proportion... shouldn't be necessary as already subsampled.
group_by(pathway) %>%
mutate(Abundance=log2(Abundance+(min_nonzero(Abundance)*(2/3)))) %>%
ungroup() %>%
left_join(metadata) %>%
group_by(pathway, description) %>%
do(
glm(Abundance~Group + Sex + Age + `Alcohol Drink`, data=.) %>%
broom::tidy()
) %>%
group_by(term) %>%
mutate(FDR=p.adjust(p.value, method="BH")) %>%
ungroup()
picrust_results %>%
filter(term!="(Intercept)" & FDR<0.1) %>%
interactive_table() # no significant confoundersWill now extract the taxa driving these changes. Will only extract the most abundant taxa driving
sigpath<-
picrust_results %>%
filter(term=="GroupDiseased" & FDR<0.1) %>%
pull(pathway)
pathstrat<-read_tsv("picrust2_out_strat/pathways_out/path_abun_contrib.tsv.gz") %>% dplyr::rename(Pathway=`function`) %>% filter(Pathway %in% sigpath)
contribs<-
pathstrat %>%
dplyr::rename(ASV=taxon) %>%
left_join(asv_taxonomy) %>%
group_by(Genus, Pathway) %>%
summarize(mean=mean(taxon_rel_function_abun)) %>%
group_by(Pathway) %>%
top_n(5, mean) %>%
arrange(desc(mean)) %>%
summarize(Genera=paste(Genus, collapse = ", "))
picrust_results %>%
filter(term!="(Intercept)" & FDR<0.1) %>%
left_join(contribs %>% dplyr::rename(pathway=Pathway)) %>%
interactive_table() # no significant confounderslineage<-read_tsv("/data/shared_resources/databases/metacyc/25.1/map_metacyc-pwy_lineage.tsv", col_names=c("ID","Lineage")) #dug up from https://forum.biobakery.org/t/metacyc-hierarchy-to-invetigate-identify-specific-pathways/1830/4
plot_results<-picrust_results %>%filter(term=="GroupDiseased" & FDR<0.1)
#gplots::venn(list(meta=lineage$ID, pathways=plot_results$pathway))
#gplots::venn(list(meta=lineage$ID, pathways=plot_results$pathway), show.plot = F) %>% print() #3 pathways missing from annotation
#attr(,"intersections")$pathways
#[1] "GLYCOLYSIS-TCA-GLYOX-BYPASS" "GLYOXYLATE-BYPASS" "TCA-GLYOX-BYPASS"
#These are superpathways being excluded
lineage<-lineage %>% filter(ID %in% plot_results$pathway & Lineage!="Super-Pathways")
lineage<-lineage %>% mutate(Lineage=paste0("Origin|", Lineage,"|", ID))
tree<-
lineage %>%
filter(!duplicated(ID)) %>% # remove duplicates
data.tree::FromDataFrameTable(., pathName="Lineage", pathDelimiter = "|")
tree<-igraph::as.igraph(tree)
lout<-ggraph::create_layout(tree, layout="igraph", circular=TRUE, algorithm="tree")
lout<-lout %>% left_join(plot_results %>% dplyr::rename(name=pathway) %>% mutate(Annotation=paste(name, description, sep=": ")))
lout$Annotation=if_else(is.na(lout$Annotation), lout$name, lout$Annotation)
lout$Annotation<-gsub("−","-", lout$Annotation)
ggraph::ggraph(lout) +
ggraph::geom_edge_diagonal() +
ggraph::geom_node_point(aes(fill=estimate, size=-log10(p.value)), shape=21) +
ggraph::geom_node_text(aes(label=Annotation), size=2) +
scale_fill_gradient2(low="darkblue",high="darkred") +
theme_void() +
theme(legend.position="bottom")Validation cohorts: V3-V4: Chen PRJNA961904 https://bmcmicrobiol.biomedcentral.com/articles/10.1186/s12866-024-03233-4 31 patients and 21 healthy controls, China. (early-stage intramucosal esophageal squamous carcinoma). 52 are saliva. Sample names and numbers reveal metadata. 31 SAL must be saliva cancer, and the 21 SALNC must be no cancer. V3-V4: Jiang PRJNA853196 https://link.springer.com/article/10.1007/s00432-022-04393-4 oral swabs, control = 53, ESCC = 56. metadata available
Given issues with denoising NovaSeq data and the larger amplicon V3-V4 having problematic features for overlap, especially from Novaseq, will carry forward reverse read only.
mkdir external
mkdir external/PRJNA961904
mkdir external/PRJNA853196
export PATH=$PATH:/data/shared_resources/sftwr/sratoolkit.3.0.0-ubuntu64/bin
#vdb-config -i # caching to ~/.prefetch_cache/
while read line; do
SRR=$(echo $line | cut -d "," -f 1)
PRJ=$(echo $line | cut -d "," -f 5)
SAMN=$(echo $line | cut -d "," -f 6)
echo $(date): $SRR from $SAMN from $PRJ
mkdir -p external/$PRJ/$SAMN
fasterq-dump --outdir external/$PRJ/$SAMN --threads 6 --mem 32G $SRR
if [ -f "external/$PRJ/$SAMN/${SRR}_1.fastq" ]; then echo "...Success"; else echo "...FAILURE"; fi
done < external/PRJNA961904_SraRunTable_filt.csv
while read line; do
SRR=$(echo $line | cut -d "," -f 1)
PRJ=$(echo $line | cut -d "," -f 5)
SAMN=$(echo $line | cut -d "," -f 6)
echo $(date): $SRR from $SAMN from $PRJ
mkdir -p external/$PRJ/$SAMN
fasterq-dump --outdir external/$PRJ/$SAMN --threads 6 --mem 32G $SRR
if [ -f "external/$PRJ/$SAMN/${SRR}_1.fastq" ]; then echo "...Success"; else echo "...FAILURE"; fi
done < external/PRJNA853196_SraRunTable_filt.csvPCR amplification
16S rDNA genes of V3–V4 region were amplified using universal primers,
namely 338F (5′-ACTCCTACGGGAGGCAGCAG-3′) and 806R (5′ -GGACTACHVGGGTWTCTAAT-3′).
All PCR reactions (including denaturation, annealing, and elongation) were carried out with
Phusion® High-Fidelity PCR Master Mix (New England Biolabs).
After electrophoresis of PCR products, samples with bright main strip between
400 and 450 bp were chosen for the next mixing and purification with
Qiagen Gel Extraction Kit (Qiagen, Germany).
Sequencing processing and analysis
Purified amplicons were pooled in equimolar and paired-end sequenced on an Illumina Novaseq6000 PE250
platform (Illumina, San Diego, USA). The raw data were filtered with QIIME (v 1.8.0), discarding the
reads which were de-replicated or shorter than 150 bp. Filtered reads were clustered into OTUs with the
assumed 97% similarity. Compared with the SILVA database (version 128), the species classification
information of each OTU was obtained. All raw reads were stored in NCBI Sequence Read Archive (SRA) database,
and the accession number is PRJNA853196.
library(tidyverse)
library(dada2)
library(Biostrings)
dir.create("external/PRJNA853196/filtered")
prj_meta<-
read_csv("external/PRJNA853196_SraRunTable_filt.csv") %>% dplyr::select(SampleID=BioSample, BioProject, Run, Group) %>%
mutate(R1=paste0("external/PRJNA853196/", SampleID,"/", Run,"_1.fastq")) %>%
mutate(R2=paste0("external/PRJNA853196/", SampleID,"/", Run,"_2.fastq")) %>%
mutate(R1_filt=gsub("SAMN[0-9]+", "filtered", R1)) %>%
mutate(R2_filt=gsub("SAMN[0-9]+", "filtered", R2))
for(i in prj_meta$R1){message(file.exists(i))}
for(i in prj_meta$R2){message(file.exists(i))}
plotQualityProfile(prj_meta$R1[1:3])
plotQualityProfile(prj_meta$R2[1:3])
Biostrings::readDNAStringSet(prj_meta$R1[1], format="fastq") # 220 x 220, primers appear to have been removed
Biostrings::readDNAStringSet(prj_meta$R2[1], format="fastq")
dmp<-
filterAndTrim(
fwd=prj_meta$R1,
filt=prj_meta$R1_filt,
rev=prj_meta$R2,
filt.rev=prj_meta$R2_filt,
compress=FALSE,
trimLeft = c(0,0), # primer lengths as the primer seqs are still intact in read
truncLen = c(200,200),
multithread = 16,
rm.phix=TRUE
)
errF <- learnErrors(prj_meta$R1_filt, multithread=16)
errR <- learnErrors(prj_meta$R2_filt, multithread=16)
dadaFs <- dada(prj_meta$R1_filt, err=errF, multithread=16)
dadaRs <- dada(prj_meta$R2_filt, err=errR, multithread=16)
mergers <- mergePairs(dadaFs, prj_meta$R1_filt, dadaRs, prj_meta$R2_filt, verbose=TRUE)
#seqtab <- makeSequenceTable(mergers)
#dim(seqtab)
#sum(seqtab)
seqtab <- makeSequenceTable(dadaRs)
dim(seqtab)
sum(seqtab)
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=16, verbose=TRUE)
sum(seqtab.nochim)/sum(seqtab)
asv_table<-t(seqtab.nochim) %>% qiime2R::filter_features(2,2)
asv_taxonomy<-tibble(ASV=paste0("ASV_", 1:nrow(asv_table)), Sequence=rownames(asv_table))
asv_taxonomy$Sequence<-asv_taxonomy$Sequence %>% DNAStringSet() %>% reverseComplement() %>% as.character()
taxonomy <- assignTaxonomy(asv_taxonomy$Sequence, "/data/shared_resources/databases/Q2_2023.5_db/silva_nr99_v138.1_wSpecies_train_set.fa.gz", multithread=24)
taxonomy <- addSpecies(taxonomy, "/data/shared_resources/databases/Q2_2023.5_db/silva_species_assignment_v138.1.fa.gz", allowMultiple=TRUE)
asv_taxonomy<-asv_taxonomy %>% left_join(taxonomy %>% as.data.frame() %>% dplyr::rename(Species_Ambiguous=8) %>% rownames_to_column("Sequence"))
rownames(asv_table)<-asv_taxonomy$ASV
colnames(asv_table)<-gsub("_[12]\\.fastq","", colnames(asv_table))
write_tsv(as.data.frame(asv_table) %>% rownames_to_column("ASV"), "external/PRJNA853196/PRJNA853196_asvtable.tsv")
write_tsv(asv_taxonomy, "external/PRJNA853196/PRJNA853196_asvtaxonomy.tsv")Microbial genomic DNA was extracted using the CTAB Method and stored at -20 °C.
The V3-V4 hypervariable regions, approximately 468 bp in length, were amplified by polymerase chain reaction
(Primers: 338 F (5′-ACTCCTACGGGAGGCAGCA-3′) and 806R (5′-GGACTACHVGGGTWTCTAAT-3′).
PCR samples were quantified using the Quant-it PicoGreen dsDNA Assay Kit on a Microplate reader (BioTek, FLx800) and
mixed based on the amount of data for each sample. DNA library construction was performed using the TruSeq Nano DNA LT
Library Prep Kit (Illumina Scientific Co., Ltd). The quantity and quality of each library were measured using the Quant-iT
PicoGreen dsDNA Assay Kit and Agilent High Sensitivity DNA Kit, respectively. For qualified libraries, the NovaSeq 6000 SP Reagent Kit (500 cycles)
on the Illumina NovaSeq machine was used for 2 × 250 bp double-ended sequencing.
Raw data from the 16s rRNA sequence was processed by Personalbio Technology Co., Ltd. (Shanghai, China).
Amplicon sequence variants (ASVs) were integrated into merged taxonomic abundance and taxonomy classification tables.
All amplicon raw data have been submitted to the Sequence Read Archive (SRA) in NCBI (Archive number: PRJNA961904).
library(tidyverse)
library(dada2)
library(Biostrings)
dir.create("external/PRJNA961904/filtered")
prj_meta<-
read_csv("external/PRJNA961904_SraRunTable_filt.csv") %>% dplyr::select(SampleID=BioSample, BioProject, Run, Group) %>%
mutate(R1=paste0("external/PRJNA961904/", SampleID,"/", Run,"_1.fastq")) %>%
mutate(R2=paste0("external/PRJNA961904/", SampleID,"/", Run,"_2.fastq")) %>%
mutate(R1_filt=gsub("SAMN[0-9]+", "filtered", R1)) %>%
mutate(R2_filt=gsub("SAMN[0-9]+", "filtered", R2))
for(i in prj_meta$R1){message(file.exists(i))}
for(i in prj_meta$R2){message(file.exists(i))}
plotQualityProfile(prj_meta$R1[1:3])
plotQualityProfile(prj_meta$R2[1:3])
Biostrings::readDNAStringSet(prj_meta$R1[1], format="fastq") # primers present and 251 x 243
Biostrings::readDNAStringSet(prj_meta$R2[1], format="fastq")
dmp<-
filterAndTrim(
fwd=prj_meta$R1,
filt=prj_meta$R1_filt,
rev=prj_meta$R2,
filt.rev=prj_meta$R2_filt,
compress=FALSE,
trimLeft = c(19,20), # primer lengths as the primer seqs are still intact in read
truncLen = c(210,200),
multithread = 16,
rm.phix=TRUE
)
errF <- learnErrors(prj_meta$R1_filt, multithread=16)
errR <- learnErrors(prj_meta$R2_filt, multithread=16)
dadaFs <- dada(prj_meta$R1_filt, err=errF, multithread=16)
dadaRs <- dada(prj_meta$R2_filt, err=errR, multithread=16)
mergers <- mergePairs(dadaFs, prj_meta$R1_filt, dadaRs, prj_meta$R2_filt, verbose=TRUE)
#seqtab <- makeSequenceTable(mergers)
#dim(seqtab)
#sum(seqtab)
seqtab <- makeSequenceTable(dadaRs)
dim(seqtab)
sum(seqtab)
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=16, verbose=TRUE)
sum(seqtab.nochim)/sum(seqtab)
asv_table<-t(seqtab.nochim) %>% qiime2R::filter_features(2,2)
asv_taxonomy<-tibble(ASV=paste0("ASV_", 1:nrow(asv_table)), Sequence=rownames(asv_table))
asv_taxonomy$Sequence<-asv_taxonomy$Sequence %>% DNAStringSet() %>% reverseComplement() %>% as.character()
taxonomy <- assignTaxonomy(asv_taxonomy$Sequence, "/data/shared_resources/databases/Q2_2023.5_db/silva_nr99_v138.1_wSpecies_train_set.fa.gz", multithread=24)
taxonomy <- addSpecies(taxonomy, "/data/shared_resources/databases/Q2_2023.5_db/silva_species_assignment_v138.1.fa.gz", allowMultiple=TRUE)
asv_taxonomy<-asv_taxonomy %>% left_join(taxonomy %>% as.data.frame() %>% dplyr::rename(Species_Ambiguous=8) %>% rownames_to_column("Sequence"))
rownames(asv_table)<-asv_taxonomy$ASV
colnames(asv_table)<-gsub("_[12]\\.fastq","", colnames(asv_table))
write_tsv(as.data.frame(asv_table) %>% rownames_to_column("ASV"), "external/PRJNA961904/PRJNA961904_asvtable.tsv")
write_tsv(asv_taxonomy, "external/PRJNA961904/PRJNA961904_asvtaxonomy.tsv")mulisa<-list()
mulisa$meta<-metadata %>% dplyr::select(SampleID, Group) %>% mutate(Dataset="Mulisa") %>% mutate(Group=if_else(Group=="Diseased", "Case","Control"))
mulisa$asv<-asv_table
mulisa$tax<-asv_taxonomy
mulisa$genus<-summarize_taxa(mulisa$asv, mulisa$tax %>% column_to_rownames("ASV"))$Genus
mulisa$family<-summarize_taxa(mulisa$asv, mulisa$tax %>% column_to_rownames("ASV"))$Family
dim(mulisa$genus)## [1] 486 211
jiang<-list()
jiang$meta<-read_csv("external/PRJNA853196_SraRunTable_filt.csv") %>% dplyr::select(SampleID=Run, Group) %>% mutate(Dataset="Jiang")
jiang$asv<-read_tsv("external/PRJNA853196/PRJNA853196_asvtable.tsv") %>% column_to_rownames("ASV")
jiang$tax<-read_tsv("external/PRJNA853196/PRJNA853196_asvtaxonomy.tsv")
jiang$genus<-summarize_taxa(jiang$asv, jiang$tax %>% column_to_rownames("ASV"))$Genus
jiang$family<-summarize_taxa(jiang$asv, jiang$tax %>% column_to_rownames("ASV"))$Family
dim(jiang$genus)## [1] 532 109
chen<-list()
chen$meta<-read_csv("external/PRJNA961904_SraRunTable_filt.csv") %>% dplyr::select(SampleID=Run, Group) %>% mutate(Dataset="Chen")
chen$asv<-read_tsv("external/PRJNA961904/PRJNA961904_asvtable.tsv") %>% column_to_rownames("ASV")
chen$tax<-read_tsv("external/PRJNA961904/PRJNA961904_asvtaxonomy.tsv")
chen$genus<-summarize_taxa(chen$asv, chen$tax %>% column_to_rownames("ASV"))$Genus
chen$family<-summarize_taxa(chen$asv, chen$tax %>% column_to_rownames("ASV"))$Family
dim(chen$genus)## [1] 325 52
merged_meta<-bind_rows(mulisa$meta, chen$meta, jiang$meta)
merged_genus<-
bind_rows(
mulisa$genus %>% rownames_to_column("Taxon") %>% pivot_longer(!Taxon, names_to = "SampleID", values_to = "Counts"),
chen$genus %>% rownames_to_column("Taxon") %>% pivot_longer(!Taxon, names_to = "SampleID", values_to = "Counts"),
jiang$genus %>% rownames_to_column("Taxon") %>% pivot_longer(!Taxon, names_to = "SampleID", values_to = "Counts")
) %>%
pivot_wider(names_from = SampleID, values_from = Counts, values_fill = 0) %>%
column_to_rownames("Taxon")
pc<-
make_clr(merged_genus) %>%
t() %>%
dist(., method="euclidean") %>%
pcoa()
pc$vectors %>%
as.data.frame() %>%
rownames_to_column("SampleID") %>%
left_join(merged_meta) %>%
ggplot(aes(x=Axis.1, y=Axis.2, color=Dataset, shape=Group)) +
geom_point(alpha=0.5) +
xlab(paste0("PC1: ",round(pc$values$Relative_eig[1]*100,2),"%")) +
ylab(paste0("PC2: ",round(pc$values$Relative_eig[2]*100,2),"%")) +
scale_shape_manual(values=c(16, 1))ggsave("manuscript_figures/pco_cross_comparisons.pdf", height=2, width=3, useDingbats=F)
merged_dist<-
make_clr(merged_genus) %>%
t() %>%
dist(., method="euclidean")
adonis2(merged_dist~Group*Dataset, data=merged_meta[match(labels(merged_dist), merged_meta$SampleID),], parallel=12)## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = merged_dist ~ Group * Dataset, data = merged_meta[match(labels(merged_dist), merged_meta$SampleID), ], parallel = 12)
## Df SumOfSqs R2 F Pr(>F)
## Model 5 107018 0.1997 18.266 0.001 ***
## Residual 366 428879 0.8003
## Total 371 535897 1.0000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
merged_alpha<-bind_rows(
diversity(chen$asv %>% subsample_table(), "shannon", MARGIN=2) %>% data.frame(Shannon=.) %>% rownames_to_column("SampleID"),
diversity(jiang$asv %>% subsample_table(), "shannon", MARGIN=2) %>% data.frame(Shannon=.) %>% rownames_to_column("SampleID"),
diversity(mulisa$asv %>% subsample_table(), "shannon", MARGIN=2) %>% data.frame(Shannon=.) %>% rownames_to_column("SampleID")
) %>%
left_join(merged_meta)
merged_alpha %>%
group_by(Dataset) %>%
do(
t.test(Shannon~Group, data=.) %>%
broom::tidy()
) %>%
knitr::kable()| Dataset | estimate | estimate1 | estimate2 | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|---|---|---|
| Chen | -0.0673069 | 3.422203 | 3.489509 | -0.632158 | 0.5302942 | 47.82611 | -0.2814026 | 0.1467888 | Welch Two Sample t-test | two.sided |
| Jiang | 0.1023905 | 3.356753 | 3.254363 | 1.810364 | 0.0730558 | 106.62091 | -0.0097336 | 0.2145146 | Welch Two Sample t-test | two.sided |
| Mulisa | -0.1950222 | 3.921693 | 4.116715 | -3.019831 | 0.0028451 | 208.49174 | -0.3223367 | -0.0677077 | Welch Two Sample t-test | two.sided |
merged_alpha %>%
mutate(Group=factor(Group, levels=c("Control","Case"))) %>%
ggplot(aes(x=Dataset, y=Shannon, fill=Group)) +
geom_boxplot() +
scale_fill_manual(values=c("cornflowerblue","indianred")) +
xlab("Cohort") +
ylab("Shannon's Diversity Index") +
theme(legend.position="none")merged_family<-
bind_rows(
mulisa$family %>% rownames_to_column("Taxon") %>% pivot_longer(!Taxon, names_to = "SampleID", values_to = "Counts"),
chen$family %>% rownames_to_column("Taxon") %>% pivot_longer(!Taxon, names_to = "SampleID", values_to = "Counts"),
jiang$family%>% rownames_to_column("Taxon") %>% pivot_longer(!Taxon, names_to = "SampleID", values_to = "Counts")
) %>%
pivot_wider(names_from = SampleID, values_from = Counts, values_fill = 0) %>%
column_to_rownames("Taxon")
taxa_barplot(merged_family, merged_meta, category="Dataset") +
theme(axis.text.x = element_blank(), axis.ticks.x=element_blank())library(randomForest)
library(ROCR)
library(pROC)
ROCR_ROCs<-list()
pROC_ROCs<-list()
AUCs<-list()
Importance<-list()
splits<-list()
merged_genus_rf<-merged_genus %>% filter_features(20,20) %>% make_proportion()
for(i in 1:100){
message(i)
set.seed(as.numeric(paste0("202407",i)))
mulisa$meta<-mulisa$meta %>% sample_n(nrow(.)) %>% mutate(Dataset=c(rep("Training", 139), rep("Test", 72)))
splits[[i]]<-mulisa$meta %>% group_by(Group, Dataset) %>% count() %>% pivot_wider(names_from = Group, values_from = n) %>% mutate(Iteration=i)
RFmeta<-bind_rows(mulisa$meta, jiang$meta, chen$meta) %>% mutate(Group=factor(Group))
RFdata<-merged_genus_rf[,RFmeta$SampleID] %>% t()
fit<-randomForest(x=RFdata[subset(RFmeta, Dataset=="Training")$SampleID,],
y=subset(RFmeta, Dataset=="Training")$Group,
importance=TRUE
)
Importance[[i]]<-
fit$importance %>%
as.data.frame() %>%
rownames_to_column("Taxon") %>%
arrange(desc("MeanDecreaseGini")) %>%
mutate(Iteration=i) %>%
dplyr::select(Iteration, Taxon, everything()) %>%
as_tibble()
for(Task in c("Training","Test","Jiang","Chen")){
tm<-predict(fit, newdata=RFdata[subset(RFmeta, Dataset==Task)$SampleID,], type="prob")[,2] %>%
prediction(., subset(RFmeta, Dataset==Task)$Group) %>%
performance(., "tpr","fpr")
ROCR_ROCs[[paste0(Task,"_", i)]]<-
tibble(Iteration=i, Task=Task, FPR=unlist(tm@x.values), TPR=unlist(tm@y.values))
#https://stackoverflow.com/questions/54642823/how-to-get-specific-tpr-from-a-sequence-of-fpr-in-r
pROC_ROCs[[paste0(Task,"_", i)]]<-
predict(fit, newdata=RFdata[subset(RFmeta, Dataset==Task)$SampleID,], type="prob")[,2] %>%
roc(subset(RFmeta, Dataset==Task)$Group, .) %>%
coords(., x=1-seq(0,1,0.02), input="specificity", ret = c("se", "sp")) %>%
tibble(Iteration=i, Task=Task, FPR=specificity, TPR=sensitivity) %>%
mutate(FPR=1-FPR)
tm<-predict(fit, newdata=RFdata[subset(RFmeta, Dataset==Task)$SampleID,], type="prob")[,2] %>%
prediction(., subset(RFmeta, Dataset==Task)$Group) %>%
performance(., "auc")
AUCs[[paste0(Task,"_", i)]]<-tibble(Iteration=i, Task=Task, AUC=tm@y.values[[1]])
}
}Check class balance of sampled splits.
bind_rows(splits) %>% filter(Dataset=="Training") %>% mutate(Ratio=Case/Control) %>% summarise_at("Ratio", lst(mean,median,min,max,sd))## # A tibble: 1 × 6
## Dataset mean median min max sd
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Training 0.961 0.958 0.805 1.17 0.0823
bind_rows(ROCR_ROCs) %>%
filter(Task!="Training") %>%
ggplot(aes(x=FPR, y=TPR, color=Task, group=paste(Task, Iteration))) +
geom_abline(linetype="dashed", color="grey50") +
geom_line()ggsave("manuscript_figures/ROC_indiv.pdf", height=2, width=3, useDingbats=F)
bind_rows(pROC_ROCs) %>%
filter(Task!="Training") %>%
ggplot(aes(x=FPR, y=TPR, fill=Task)) +
geom_abline(linetype="dashed", color="grey50") +
stat_summary(geom="ribbon", alpha=0.5, fun.data=mean_sd) +
stat_summary(geom="line", linetype="dashed")AUC values and Importance Scores for inset:
bind_rows(AUCs) %>% group_by(Task) %>% summarize(AUC_mean=mean(AUC), AUC_median=median(AUC), AUC_sd=sd(AUC), AUC_min=min(AUC), AUC_max=max(AUC)) %>% interactive_table()bind_rows(AUCs) %>% group_by(Task) %>% summarize(AUC_mean=mean(AUC), AUC_median=median(AUC), AUC_sd=sd(AUC), AUC_min=min(AUC), AUC_max=max(AUC)) %>% write_tsv("manuscript_figures/AUROCS.tsv")
bind_rows(Importance) %>%
group_by(Taxon) %>%
summarize_at("MeanDecreaseGini", lst(mean,sd,median,min,max)) %>%
arrange(desc(mean)) %>%
interactive_table()bind_rows(Importance) %>%
group_by(Taxon) %>%
summarize_at("MeanDecreaseGini", lst(mean,sd,median,min,max)) %>%
arrange(desc(mean)) %>%
mutate(Rank=1:nrow(.)) %>%
mutate(Taxon=factor(Taxon, levels=rev(Taxon))) %>%
ggplot(aes(x=Rank, y=mean, ymin=mean-sd, ymax=mean+sd)) +
geom_ribbon(fill="grey80") +
geom_line(linetype="dashed", color="grey20") +
#geom_errorbar(width=0) +
#geom_point() +
coord_flip() +
scale_x_reverse() +
ylab("Mean Decrease GINI (mean ± SD)")bind_rows(Importance) %>%
group_by(Taxon) %>%
summarize_at("MeanDecreaseGini", lst(mean,sd,median,min,max)) %>%
arrange(desc(mean)) %>%
mutate(Rank=1:nrow(.)) %>%
filter(Rank<20) %>%
mutate(Taxon=factor(Taxon, levels=rev(Taxon))) %>%
ggplot(aes(x=Taxon, y=mean, ymin=mean-sd, ymax=mean+sd)) +
geom_errorbar(width=0) +
geom_point() +
coord_flip() +
ylab("Mean Decrease GINI (mean ± SD)")